author     Jason Gunthorpe <jgg@mellanox.com>  2019-08-21 13:12:29 -0400
committer  Jason Gunthorpe <jgg@mellanox.com>  2019-08-21 19:58:18 -0400
commit     daa138a58c802e7b4c2fb73f9b85bb082616ef43
tree       be913e8e3745bb367d2ba371598f447649102cfc  /fs/io_uring.c
parent     6869b7b206595ae0e326f59719090351eb8f4f5d
parent     fba0e448a2c5b297a4ddc1ec4e48f4aa6600a1c9
Merge branch 'odp_fixes' into hmm.git
From rdma.git Jason Gunthorpe says:

====================
This is a collection of general cleanups for ODP to clarify some of the
flows around umem creation and use of the interval tree.
====================

The branch is based on v5.3-rc5 due to dependencies, and is being taken
into hmm.git due to dependencies in the next patches.

* odp_fixes:
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  RDMA/core: Make invalidate_range a device operation
  RDMA/odp: Use kvcalloc for the dma_list and page_list
  RDMA/odp: Check for overflow when computing the umem_odp end
  RDMA/odp: Provide ib_umem_odp_release() to undo the allocs
  RDMA/odp: Split creating a umem_odp from ib_umem_get
  RDMA/odp: Make the three ways to create a umem_odp clear
  RMDA/odp: Consolidate umem_odp initialization
  RDMA/odp: Make it clearer when a umem is an implicit ODP umem
  RDMA/odp: Iterate over the whole rbtree directly
  RDMA/odp: Use the common interval tree library instead of generic
  RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--   fs/io_uring.c   96
1 file changed, 72 insertions(+), 24 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2a66e12fbc6..24bbe3cb7ad4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
 
 	struct file		*file;
 	off_t			io_end;
-	size_t			io_pages;
+	size_t			io_len;
 };
 
 struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_FAIL_LINK		128	/* fail rest of links */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
 			nxt->flags |= REQ_F_LINK;
 		}
 
+		nxt->flags |= REQ_F_LINK_DONE;
 		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 		queue_work(req->ctx->sqo_wq, &nxt->work);
 	}
@@ -1064,8 +1066,42 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	 */
 	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-	if (offset)
-		iov_iter_advance(iter, offset);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec, just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= bvec->bv_len + offset;
+			iter->iov_offset = offset & ~PAGE_MASK;
+		}
+	}
+
 	return 0;
 }
 
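As a side note on the hunk above: because every bvec of a registered buffer except possibly the first and last is exactly PAGE_SIZE, the target segment and the offset into it can be computed directly instead of walking the iterator. The following standalone C sketch reproduces only that arithmetic; the 4K page size and the example lengths are illustrative assumptions, not taken from any particular build.

/* Sketch of the segment-skip arithmetic: a first bvec of arbitrary
 * length followed by PAGE_SIZE bvecs, as for a registered buffer.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long first_len = 512;	/* unaligned first bvec (assumed) */
	unsigned long offset = first_len + 3 * PAGE_SIZE + 100;

	if (offset <= first_len) {
		printf("stay in segment 0 at offset %lu\n", offset);
	} else {
		unsigned long seg_skip;

		offset -= first_len;			/* skip first vec */
		seg_skip = 1 + (offset >> PAGE_SHIFT);	/* whole pages past it */

		/* prints: skip 4 segments, iov_offset 100 */
		printf("skip %lu segments, iov_offset %lu\n",
		       seg_skip, offset & ~PAGE_MASK);
	}
	return 0;
}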
@@ -1120,28 +1156,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	off_t io_end = kiocb->ki_pos + len;
 
 	if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-		unsigned long max_pages;
+		unsigned long max_bytes;
 
 		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_pages = filp->f_ra.ra_pages;
-		if (!max_pages)
-			max_pages = VM_READAHEAD_PAGES;
-		max_pages *= 8;
-
-		/* If max pages are exceeded, reset the state */
-		len >>= PAGE_SHIFT;
-		if (async_list->io_pages + len <= max_pages) {
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If max len are exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
 			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_pages += len;
+			async_list->io_len += len;
 		} else {
 			io_end = 0;
-			async_list->io_pages = 0;
+			async_list->io_len = 0;
 		}
 	}
 
 	/* New file? Reset state. */
 	if (async_list->file != filp) {
-		async_list->io_pages = 0;
+		async_list->io_len = 0;
 		async_list->file = filp;
 	}
 	async_list->io_end = io_end;
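To make the shift in the hunk above concrete: ra_pages << PAGE_SHIFT converts pages to bytes, and the extra +3 in the shift multiplies by 8, so the byte limit matches the old max_pages * 8 check. A tiny standalone C sketch with an assumed 4K page and a 32-page readahead window (illustrative values, not asserted as kernel defaults):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long ra_pages = 32;	/* assumed readahead window, in pages */
	unsigned long max_bytes = ra_pages << (PAGE_SHIFT + 3);

	/* 32 pages * 4096 bytes * 8 = 1048576 bytes */
	printf("max_bytes = %lu\n", max_bytes);
	return 0;
}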
@@ -1630,6 +1664,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
+	INIT_LIST_HEAD(&req->list);
+
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 
 	spin_lock_irq(&ctx->completion_lock);
@@ -1800,6 +1836,7 @@ restart:
 	do {
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -1844,6 +1881,10 @@ restart:
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* req from defer and link list needn't decrease async cnt */
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
 		if (!async_list)
 			break;
 		if (!list_empty(&req_list)) {
@@ -1891,6 +1932,7 @@ restart:
 		}
 	}
 
+out:
 	if (cur_mm) {
 		set_fs(old_fs);
 		unuse_mm(cur_mm);
@@ -1917,6 +1959,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	ret = true;
 	spin_lock(&list->lock);
 	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
 	if (!atomic_read(&list->cnt)) {
 		list_del_init(&req->list);
 		ret = false;
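The smp_mb() in the hunk above orders the submitter's list_add_tail() before its atomic_read() of list->cnt, pairing against the worker, which drops its count and then re-checks the list; with full ordering on both sides at least one of the two is guaranteed to observe the other. Below is a rough userspace analogue of that store-then-load pattern, using C11 seq_cst fences where the kernel uses smp_mb(); the names and the single boolean flag are illustrative stand-ins, not the kernel's data structures.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  worker_cnt  = 1;	/* stands in for list->cnt */
static atomic_bool work_queued = false;	/* stands in for the req list */

/* submitter side, loosely mirroring io_add_to_prev_work() */
static bool try_piggyback(void)
{
	atomic_store_explicit(&work_queued, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	if (atomic_load_explicit(&worker_cnt, memory_order_relaxed) == 0) {
		/* worker already exiting: take the work back */
		atomic_store_explicit(&work_queued, false, memory_order_relaxed);
		return false;
	}
	return true;	/* worker is still around and will see the work */
}

/* worker side, loosely mirroring the exit path of io_sq_wq_submit_work() */
static bool worker_sees_more_work(void)
{
	atomic_fetch_sub_explicit(&worker_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairing barrier */
	return atomic_load_explicit(&work_queued, memory_order_relaxed);
}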
@@ -1977,6 +2023,15 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		}
+		return 0;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -2049,13 +2104,6 @@ err:
 		return;
 	}
 
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret != -EIOCBQUEUED)
-			goto err_req;
-		return;
-	}
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but