Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c | 162
1 file changed, 119 insertions(+), 43 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2a66e12fbc6..cfb48bd088e1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
 
 	struct file		*file;
 	off_t			io_end;
-	size_t			io_pages;
+	size_t			io_len;
 };
 
 struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_FAIL_LINK		128	/* fail rest of links */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
 			nxt->flags |= REQ_F_LINK;
 		}
 
+		nxt->flags |= REQ_F_LINK_DONE;
 		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 		queue_work(req->ctx->sqo_wq, &nxt->work);
 	}
@@ -677,6 +679,13 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
 /*
  * Find and free completed poll iocbs
  */
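The io_cqring_events() helper hoisted above reports the number of completions waiting in the CQ ring as the unsigned difference between the producer (tail) and consumer (head) indices; since both are free-running 32-bit counters, the subtraction stays correct across wraparound. A minimal userspace sketch of the same counting arithmetic (struct cq_ring_stub and cq_pending() are illustrative stand-ins, and the kernel's smp_rmb() pairing is omitted):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the CQ ring's head/tail index pair. */
struct cq_ring_stub {
	uint32_t head;	/* consumer index, advanced by the application */
	uint32_t tail;	/* producer index, advanced by the kernel */
};

/* Pending events = free-running tail minus free-running head (mod 2^32). */
static uint32_t cq_pending(const struct cq_ring_stub *r)
{
	return r->tail - r->head;
}

int main(void)
{
	struct cq_ring_stub r = { .head = 10, .tail = 14 };

	assert(cq_pending(&r) == 4);

	/* Still correct once the 32-bit counters wrap past zero. */
	r.head = UINT32_MAX - 1;	/* 0xfffffffe */
	r.tail = 3;			/* already wrapped */
	assert(cq_pending(&r) == 5);

	printf("pending: %u\n", cq_pending(&r));
	return 0;
}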
@@ -769,7 +778,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				long min)
 {
-	while (!list_empty(&ctx->poll_list)) {
+	while (!list_empty(&ctx->poll_list) && !need_resched()) {
 		int ret;
 
 		ret = io_do_iopoll(ctx, nr_events, min);
@@ -796,6 +805,12 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 		unsigned int nr_events = 0;
 
 		io_iopoll_getevents(ctx, &nr_events, 1);
+
+		/*
+		 * Ensure we allow local-to-the-cpu processing to take place,
+		 * in this case we need to ensure that we reap all events.
+		 */
+		cond_resched();
 	}
 	mutex_unlock(&ctx->uring_lock);
 }
@@ -803,11 +818,42 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 			   long min)
 {
-	int ret = 0;
+	int iters, ret = 0;
+
+	/*
+	 * We disallow the app entering submit/complete with polling, but we
+	 * still need to lock the ring to prevent racing with polled issue
+	 * that got punted to a workqueue.
+	 */
+	mutex_lock(&ctx->uring_lock);
 
+	iters = 0;
 	do {
 		int tmin = 0;
 
+		/*
+		 * Don't enter poll loop if we already have events pending.
+		 * If we do, we can potentially be spinning for commands that
+		 * already triggered a CQE (eg in error).
+		 */
+		if (io_cqring_events(ctx->cq_ring))
+			break;
+
+		/*
+		 * If a submit got punted to a workqueue, we can have the
+		 * application entering polling for a command before it gets
+		 * issued. That app will hold the uring_lock for the duration
+		 * of the poll right here, so we need to take a breather every
+		 * now and then to ensure that the issue has a chance to add
+		 * the poll to the issued list. Otherwise we can spin here
+		 * forever, while the workqueue is stuck trying to acquire the
+		 * very same mutex.
+		 */
+		if (!(++iters & 7)) {
+			mutex_unlock(&ctx->uring_lock);
+			mutex_lock(&ctx->uring_lock);
+		}
+
 		if (*nr_events < min)
 			tmin = min - *nr_events;
 
@@ -817,6 +863,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		ret = 0;
 	} while (min && !*nr_events && !need_resched());
 
+	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
 
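The iters counter introduced above exists so the busy-poll loop periodically drops and immediately re-takes uring_lock; a submission that was punted to the workqueue blocks on that same mutex, so without the breather the poller could spin forever waiting for a request the worker can never queue. Below is a rough userspace sketch of that "release the lock every 8th iteration" shape, using pthreads instead of the kernel mutex; breather_poll(), events_pending() and poll_once() are made-up names for illustration:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Trivial stand-ins so the sketch compiles; real code would poll hardware. */
static bool events_pending(void) { return false; }
static int  poll_once(void)      { return 1; }

int breather_poll(int min_events)
{
	int found = 0, iters = 0;

	pthread_mutex_lock(&lock);
	do {
		/* Don't keep spinning if completions already arrived. */
		if (events_pending())
			break;

		/*
		 * Every 8th pass, drop and re-take the lock so another thread
		 * blocked on it (the punted issue path in the hunk above) can
		 * slip in; otherwise both sides could wait on each other
		 * indefinitely.
		 */
		if (!(++iters & 7)) {
			pthread_mutex_unlock(&lock);
			pthread_mutex_lock(&lock);
		}

		found += poll_once();
	} while (found < min_events);
	pthread_mutex_unlock(&lock);

	return found;
}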
@@ -1064,8 +1111,42 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	 */
 	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-	if (offset)
-		iov_iter_advance(iter, offset);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec, just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= bvec->bv_len + offset;
+			iter->iov_offset = offset & ~PAGE_MASK;
+		}
+	}
+
 	return 0;
 }
 
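The fast-forward above sidesteps iov_iter_advance() by exploiting the fixed-buffer layout: every registered bvec except possibly the first and last is exactly one page, so the target segment index and intra-segment offset follow directly from shift-and-mask arithmetic on the byte offset. A small worked example of that index calculation (4 KiB pages and the locate() helper are assumptions made purely for the illustration):

#include <assert.h>
#include <stddef.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

/*
 * Given a byte offset into a buffer whose first segment holds first_len
 * bytes and whose remaining segments are full pages, compute which segment
 * the offset lands in and the offset within that segment.
 */
static void locate(size_t offset, size_t first_len,
		   size_t *seg, size_t *off_in_seg)
{
	if (offset <= first_len) {
		*seg = 0;
		*off_in_seg = offset;
		return;
	}

	offset -= first_len;			/* skip the (short) first segment */
	*seg = 1 + (offset >> EX_PAGE_SHIFT);	/* full pages skipped after it */
	*off_in_seg = offset & ~EX_PAGE_MASK;	/* remainder inside that page */
}

int main(void)
{
	size_t seg, off;

	/* First segment holds 1024 bytes; look up byte 1024 + 2*4096 + 100. */
	locate(1024 + 2 * EX_PAGE_SIZE + 100, 1024, &seg, &off);
	assert(seg == 3 && off == 100);
	return 0;
}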
@@ -1120,28 +1201,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	off_t io_end = kiocb->ki_pos + len;
 
 	if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-		unsigned long max_pages;
+		unsigned long max_bytes;
 
 		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_pages = filp->f_ra.ra_pages;
-		if (!max_pages)
-			max_pages = VM_READAHEAD_PAGES;
-		max_pages *= 8;
-
-		/* If max pages are exceeded, reset the state */
-		len >>= PAGE_SHIFT;
-		if (async_list->io_pages + len <= max_pages) {
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If max len are exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
 			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_pages += len;
+			async_list->io_len += len;
 		} else {
 			io_end = 0;
-			async_list->io_pages = 0;
+			async_list->io_len = 0;
 		}
 	}
 
 	/* New file? Reset state. */
 	if (async_list->file != filp) {
-		async_list->io_pages = 0;
+		async_list->io_len = 0;
 		async_list->file = filp;
 	}
 	async_list->io_end = io_end;
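The limiter now accumulates bytes instead of pages: `ra_pages << (PAGE_SHIFT + 3)` is the readahead window converted to bytes and multiplied by 8, and dropping the old `len >>= PAGE_SHIFT` step means sub-page IOs also count toward the limit instead of rounding down to zero pages. A quick sanity check of the shift, assuming 4 KiB pages and a typical 32-page readahead window:

#include <assert.h>

int main(void)
{
	const unsigned long page_shift = 12;	/* 4 KiB pages, for illustration */
	const unsigned long ra_pages = 32;	/* a 128 KiB readahead window */

	/* ra_pages << (page_shift + 3) == ra_pages * PAGE_SIZE * 8 */
	unsigned long max_bytes = ra_pages << (page_shift + 3);

	assert(max_bytes == 32UL * 4096 * 8);	/* 1 MiB */
	return 0;
}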
@@ -1630,6 +1709,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
+	INIT_LIST_HEAD(&req->list);
+
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 
 	spin_lock_irq(&ctx->completion_lock);
@@ -1800,6 +1881,7 @@ restart:
 	do {
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -1844,6 +1926,10 @@ restart:
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* req from defer and link list needn't decrease async cnt */
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
 		if (!async_list)
 			break;
 		if (!list_empty(&req_list)) {
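The per-iteration copy of req->flags taken in the previous hunk matters here: issuing the request can complete and free it, so by the time this post-kfree accounting decision is made, only the local snapshot is safe to consult. A generic sketch of that snapshot-before-possible-free pattern (struct request, RQ_F_SKIP_ACCOUNTING and the helpers are invented for the illustration):

#include <stdbool.h>
#include <stdlib.h>

struct request {
	unsigned int flags;
	/* ... */
};

#define RQ_F_SKIP_ACCOUNTING	1u

/* Completing a request may free it; the caller must not touch it after. */
static void complete_and_free(struct request *req)
{
	free(req);
}

bool process(struct request *req)
{
	/* Snapshot anything still needed once the request may be gone. */
	unsigned int flags = req->flags;

	complete_and_free(req);		/* req must not be dereferenced below */

	/* Decide post-completion bookkeeping from the snapshot, not from req. */
	return !(flags & RQ_F_SKIP_ACCOUNTING);
}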
@@ -1891,6 +1977,7 @@ restart:
 		}
 	}
 
+out:
 	if (cur_mm) {
 		set_fs(old_fs);
 		unuse_mm(cur_mm);
@@ -1917,6 +2004,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	ret = true;
 	spin_lock(&list->lock);
 	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
 	if (!atomic_read(&list->cnt)) {
 		list_del_init(&req->list);
 		ret = false;
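The full barrier added above orders the list_add_tail() store against the following atomic_read(&list->cnt), pairing with the worker side in io_sq_wq_submit_work(), which decrements the count and then re-checks the list; with the barrier, at least one of the two sides observes the other's update, so a queued request cannot be stranded with no worker left running. A very loose userspace sketch of that store-then-check-the-other-variable handshake, using C11 seq_cst atomics in place of the kernel primitives (all names here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

/* Shared state: work published by the submitter, number of live workers. */
static atomic_bool work_queued = false;
static atomic_int  worker_cnt  = 0;

/*
 * Submitter: publish the work, then check whether a worker is still alive.
 * seq_cst atomics provide the store->load ordering the kernel gets from smp_mb().
 */
bool try_hand_off_to_worker(void)
{
	atomic_store(&work_queued, true);		/* "list_add_tail()" */
	if (atomic_load(&worker_cnt) == 0) {		/* "atomic_read(&list->cnt)" */
		atomic_store(&work_queued, false);	/* take it back, run inline */
		return false;
	}
	return true;					/* a worker will pick it up */
}

/*
 * Worker, on its way out: drop the count, then re-check for late arrivals.
 */
bool worker_should_keep_running(void)
{
	atomic_fetch_sub(&worker_cnt, 1);		/* "atomic_dec(&list->cnt)" */
	if (atomic_load(&work_queued)) {		/* late request slipped in? */
		atomic_fetch_add(&worker_cnt, 1);	/* stay and process it */
		return true;
	}
	return false;
}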
@@ -1977,6 +2068,15 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		}
+		return 0;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -2049,13 +2149,6 @@ err:
 		return;
 	}
 
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret != -EIOCBQUEUED)
-			goto err_req;
-		return;
-	}
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but
@@ -2232,15 +2325,7 @@ static int io_sq_thread(void *data)
 			unsigned nr_events = 0;
 
 			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * We disallow the app entering submit/complete
-				 * with polling, but we still need to lock the
-				 * ring to prevent racing with polled issue
-				 * that got punted to a workqueue.
-				 */
-				mutex_lock(&ctx->uring_lock);
 				io_iopoll_check(ctx, &nr_events, 0);
-				mutex_unlock(&ctx->uring_lock);
 			} else {
 				/*
 				 * Normal IO, just pretend everything completed.
@@ -2385,13 +2470,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 	return submit;
 }
 
-static unsigned io_cqring_events(struct io_cq_ring *ring)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
-}
-
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -3142,9 +3220,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	min_complete = min(min_complete, ctx->cq_entries);
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		mutex_lock(&ctx->uring_lock);
 		ret = io_iopoll_check(ctx, &nr_events, min_complete);
-		mutex_unlock(&ctx->uring_lock);
 	} else {
 		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 	}