path: root/drivers/gpu
author     Chris Wilson <chris@chris-wilson.co.uk>   2016-08-18 12:17:12 -0400
committer  Chris Wilson <chris@chris-wilson.co.uk>   2016-08-18 17:36:59 -0400
commit     0b5372727be37944239100ff05a63df9771c8484 (patch)
tree       edbad8a45469452c799f030ff601073b7ee9c346 /drivers/gpu
parent     068715b922a6f87c454cdfa15bb8049d2076eee6 (diff)
drm/i915/cmdparser: Use cached vmappings
The single largest factor in the overhead of parsing the commands is
the setup of the virtual mapping to provide a continuous block for the
batch buffer. If we keep those vmappings around (against the better
judgement of mm/vmalloc.c, which we offset by handwaving and looking
suggestively at the shrinker) we can dramatically improve the
performance of the parser for small batches (such as media workloads).
Furthermore, we can use the prepare shmem read/write functions to
determine how best we need to clflush the range (rather than every page
of the object).

The impact of caching both src/dst vmaps is +80% on ivb and +140% on
byt for the throughput on small batches. (Caching just the dst vmap and
iterating over the src, doing a page by page copy is roughly 5% slower
on both platforms. That may be an acceptable trade-off to eliminate one
cached vmapping, and we may be able to reduce the per-page copying
overhead further.) For *this* simple test case, the cmdparser is now
within a factor of 2 of ideal performance.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-33-chris@chris-wilson.co.uk
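In outline, the reworked copy_batch() leans on the object's cached kernel
vmapping (i915_gem_object_pin_map()/unpin_map()) and on the shmem prepare
helpers for the clflush decisions. The sketch below condenses that flow from
the hunks that follow; it is an illustration only, with error unwinding and
the surrounding parser code omitted, and the _sketch name is made up:

/*
 * Condensed sketch of the new copy_batch() flow (see the full hunk below).
 * Error unwinding is abbreviated; identifiers otherwise follow the patch.
 */
static u32 *copy_batch_sketch(struct drm_i915_gem_object *dst_obj,
                              struct drm_i915_gem_object *src_obj,
                              u32 batch_start_offset, u32 batch_len,
                              bool *needs_clflush_after)
{
        unsigned int src_needs_clflush, dst_needs_clflush;
        void *src, *dst;

        /* Ask the shmem helpers how each object needs to be clflushed,
         * rather than flushing every page of the object. */
        i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
        i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush);

        /* Reuse the cached vmapping of each object instead of building a
         * fresh vmap() on every parse (the old vmap_batch()). */
        src = i915_gem_object_pin_map(src_obj, I915_MAP_WB);
        dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB);

        src += batch_start_offset;
        if (src_needs_clflush)
                drm_clflush_virt_range(src, batch_len);

        /* Round the copy up to whole cachelines so no partial-line flush is
         * needed before the write; only the real batch is validated anyway. */
        if (dst_needs_clflush & CLFLUSH_BEFORE)
                batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);

        memcpy(dst, src, batch_len);

        /* dst_obj is returned with its vmap still pinned; the caller flushes
         * after parsing if CLFLUSH_AFTER is set and then unpins it. */
        *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;

        i915_gem_object_unpin_map(src_obj);
        i915_gem_obj_finish_shmem_access(dst_obj);
        i915_gem_obj_finish_shmem_access(src_obj);
        return dst;
}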
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/drm/i915/i915_cmd_parser.c      | 127
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c  |   8
2 files changed, 54 insertions(+), 81 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 8ebc0ce44a76..5d9ea163d1c8 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -937,98 +937,63 @@ find_reg_in_tables(const struct drm_i915_reg_table *tables,
 	return NULL;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-		       unsigned start, unsigned len)
-{
-	int i;
-	void *addr = NULL;
-	struct sg_page_iter sg_iter;
-	int first_page = start >> PAGE_SHIFT;
-	int last_page = (len + start + 4095) >> PAGE_SHIFT;
-	int npages = last_page - first_page;
-	struct page **pages;
-
-	pages = drm_malloc_ab(npages, sizeof(*pages));
-	if (pages == NULL) {
-		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-		goto finish;
-	}
-
-	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
-		pages[i++] = sg_page_iter_page(&sg_iter);
-		if (i == npages)
-			break;
-	}
-
-	addr = vmap(pages, i, 0, PAGE_KERNEL);
-	if (addr == NULL) {
-		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-		goto finish;
-	}
-
-finish:
-	if (pages)
-		drm_free_large(pages);
-	return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
+/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
+static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 		       struct drm_i915_gem_object *src_obj,
 		       u32 batch_start_offset,
-		       u32 batch_len)
+		       u32 batch_len,
+		       bool *needs_clflush_after)
 {
-	unsigned int needs_clflush;
-	void *src_base, *src;
-	void *dst = NULL;
+	unsigned int src_needs_clflush;
+	unsigned int dst_needs_clflush;
+	void *src, *dst;
 	int ret;
 
-	if (batch_len > dest_obj->base.size ||
-	    batch_len + batch_start_offset > src_obj->base.size)
-		return ERR_PTR(-E2BIG);
-
-	if (WARN_ON(dest_obj->pages_pin_count == 0))
-		return ERR_PTR(-ENODEV);
-
-	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
+	if (ret)
 		return ERR_PTR(ret);
-	}
 
-	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-	if (!src_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		ret = -ENOMEM;
+	ret = i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush);
+	if (ret) {
+		dst = ERR_PTR(ret);
 		goto unpin_src;
 	}
 
-	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-		goto unmap_src;
+	src = i915_gem_object_pin_map(src_obj, I915_MAP_WB);
+	if (IS_ERR(src)) {
+		dst = src;
+		goto unpin_dst;
 	}
 
-	dst = vmap_batch(dest_obj, 0, batch_len);
-	if (!dst) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-		ret = -ENOMEM;
+	dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB);
+	if (IS_ERR(dst))
 		goto unmap_src;
-	}
 
-	src = src_base + offset_in_page(batch_start_offset);
-	if (needs_clflush)
+	src += batch_start_offset;
+	if (src_needs_clflush)
 		drm_clflush_virt_range(src, batch_len);
 
+	/* We can avoid clflushing partial cachelines before the write if we
+	 * only every write full cache-lines. Since we know that both the
+	 * source and destination are in multiples of PAGE_SIZE, we can simply
+	 * round up to the next cacheline. We don't care about copying too much
+	 * here as we only validate up to the end of the batch.
+	 */
+	if (dst_needs_clflush & CLFLUSH_BEFORE)
+		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
+
 	memcpy(dst, src, batch_len);
 
+	/* dst_obj is returned with vmap pinned */
+	*needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;
+
 unmap_src:
-	vunmap(src_base);
+	i915_gem_object_unpin_map(src_obj);
+unpin_dst:
+	i915_gem_obj_finish_shmem_access(dst_obj);
 unpin_src:
 	i915_gem_obj_finish_shmem_access(src_obj);
-
-	return ret ? ERR_PTR(ret) : dst;
+	return dst;
 }
 
 /**
@@ -1206,16 +1171,18 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		    u32 batch_len,
 		    bool is_master)
 {
-	u32 *cmd, *batch_base, *batch_end;
+	u32 *cmd, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
+	bool needs_clflush_after = false;
 	int ret = 0;
 
-	batch_base = copy_batch(shadow_batch_obj, batch_obj,
-				batch_start_offset, batch_len);
-	if (IS_ERR(batch_base)) {
+	cmd = copy_batch(shadow_batch_obj, batch_obj,
+			 batch_start_offset, batch_len,
+			 &needs_clflush_after);
+	if (IS_ERR(cmd)) {
 		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-		return PTR_ERR(batch_base);
+		return PTR_ERR(cmd);
 	}
 
 	/*
@@ -1223,9 +1190,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
 	 * space. Parsing should be faster in some cases this way.
 	 */
-	batch_end = batch_base + (batch_len / sizeof(*batch_end));
-
-	cmd = batch_base;
+	batch_end = cmd + (batch_len / sizeof(*batch_end));
 	while (cmd < batch_end) {
 		const struct drm_i915_cmd_descriptor *desc;
 		u32 length;
@@ -1284,7 +1249,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		ret = -EINVAL;
 	}
 
-	vunmap(batch_base);
+	if (ret == 0 && needs_clflush_after)
+		drm_clflush_virt_range(shadow_batch_obj->mapping, batch_len);
+	i915_gem_object_unpin_map(shadow_batch_obj);
 
 	return ret;
 }
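As a side note on the cacheline round-up in copy_batch() above: padding the
copy length to a whole number of clflush-sized lines means the memcpy() never
writes a partial cacheline, which is what lets the CLFLUSH_BEFORE case skip
flushing partial lines. A tiny standalone illustration (user-space C with
made-up sizes, not kernel code):

#include <stdio.h>

/* Same rounding the patch applies via roundup(batch_len, x86_clflush_size). */
static unsigned int round_up_to(unsigned int len, unsigned int align)
{
        return ((len + align - 1) / align) * align;
}

int main(void)
{
        unsigned int clflush_size = 64;  /* hypothetical clflush line size */
        unsigned int batch_len = 100;    /* hypothetical batch length */

        /* 100 bytes becomes a 128-byte copy: two full cachelines, no partial
         * line written, and the extra bytes are never validated anyway. */
        printf("copy %u bytes instead of %u\n",
               round_up_to(batch_len, clflush_size), batch_len);
        return 0;
}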
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 907386630e26..4192066ff60e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1512,7 +1512,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
 			params->args_batch_start_offset;
 
 	if (exec_len == 0)
-		exec_len = params->batch->size;
+		exec_len = params->batch->size - params->args_batch_start_offset;
 
 	ret = params->engine->emit_bb_start(params->request,
 					    exec_start, exec_len,
@@ -1738,6 +1738,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		ret = -EINVAL;
 		goto err;
 	}
+	if (args->batch_start_offset > params->batch->size ||
+	    args->batch_len > params->batch->size - args->batch_start_offset) {
+		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
+		ret = -EINVAL;
+		goto err;
+	}
 
 	params->args_batch_start_offset = args->batch_start_offset;
 	if (intel_engine_needs_cmd_parser(engine) && args->batch_len) {
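One property worth noting about the new execbuffer check just above: written
as "offset > size || len > size - offset", it cannot wrap, whereas the
"len + offset > size" form it replaces in copy_batch() can overflow in 32-bit
arithmetic and wrongly pass. A standalone illustration with made-up values
(user-space C, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Old-style test: the u32 addition can wrap and slip past the size check. */
static int in_bounds_overflowing(uint32_t size, uint32_t offset, uint32_t len)
{
        return offset + len <= size;
}

/* New-style test, shaped like the execbuffer hunk above: no addition, no wrap. */
static int in_bounds_safe(uint32_t size, uint32_t offset, uint32_t len)
{
        return offset <= size && len <= size - offset;
}

int main(void)
{
        uint32_t size = 4096, offset = 16, len = UINT32_MAX - 8; /* made up */

        /* 16 + (UINT32_MAX - 8) wraps to 7, so the old form accepts a batch
         * that is wildly out of bounds; the rearranged form rejects it. */
        printf("overflowing form says: %d\n", in_bounds_overflowing(size, offset, len));
        printf("rearranged form says:  %d\n", in_bounds_safe(size, offset, len));
        return 0;
}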