path: root/drivers/gpu
author     Chris Wilson <chris@chris-wilson.co.uk>   2016-08-18 12:17:12 -0400
committer  Chris Wilson <chris@chris-wilson.co.uk>   2016-08-18 17:36:59 -0400
commit     0b5372727be37944239100ff05a63df9771c8484 (patch)
tree       edbad8a45469452c799f030ff601073b7ee9c346 /drivers/gpu
parent     068715b922a6f87c454cdfa15bb8049d2076eee6 (diff)
drm/i915/cmdparser: Use cached vmappings
The single largest factor in the overhead of parsing the commands is
the setup of the virtual mapping to provide a continuous block for the
batch buffer. If we keep those vmappings around (against the better
judgement of mm/vmalloc.c, which we offset by handwaving and looking
suggestively at the shrinker) we can dramatically improve the
performance of the parser for small batches (such as media workloads).
Furthermore, we can use the prepare shmem read/write functions to
determine how best we need to clflush the range (rather than every page
of the object).

The impact of caching both src/dst vmaps is +80% on ivb and +140% on
byt for the throughput on small batches. (Caching just the dst vmap and
iterating over the src, doing a page by page copy is roughly 5% slower
on both platforms. That may be an acceptable trade-off to eliminate one
cached vmapping, and we may be able to reduce the per-page copying
overhead further.) For *this* simple test case, the cmdparser is now
within a factor of 2 of ideal performance.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-33-chris@chris-wilson.co.uk
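In outline, the reworked copy_batch() leans on the object's cached kernel
vmapping (i915_gem_object_pin_map()/unpin_map()) and on the shmem prepare
helpers for the clflush decisions. The sketch below condenses that flow from
the hunks that follow; it is an illustration only, with error unwinding and
the surrounding parser code omitted, and the _sketch name is made up:

/*
 * Condensed sketch of the new copy_batch() flow (see the full hunk below).
 * Error unwinding is abbreviated; identifiers otherwise follow the patch.
 */
static u32 *copy_batch_sketch(struct drm_i915_gem_object *dst_obj,
                              struct drm_i915_gem_object *src_obj,
                              u32 batch_start_offset, u32 batch_len,
                              bool *needs_clflush_after)
{
        unsigned int src_needs_clflush, dst_needs_clflush;
        void *src, *dst;

        /* Ask the shmem helpers how each object needs to be clflushed,
         * rather than flushing every page of the object. */
        i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
        i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush);

        /* Reuse the cached vmapping of each object instead of building a
         * fresh vmap() on every parse (the old vmap_batch()). */
        src = i915_gem_object_pin_map(src_obj, I915_MAP_WB);
        dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB);

        src += batch_start_offset;
        if (src_needs_clflush)
                drm_clflush_virt_range(src, batch_len);

        /* Round the copy up to whole cachelines so no partial-line flush is
         * needed before the write; only the real batch is validated anyway. */
        if (dst_needs_clflush & CLFLUSH_BEFORE)
                batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);

        memcpy(dst, src, batch_len);

        /* dst_obj is returned with its vmap still pinned; the caller flushes
         * after parsing if CLFLUSH_AFTER is set and then unpins it. */
        *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;

        i915_gem_object_unpin_map(src_obj);
        i915_gem_obj_finish_shmem_access(dst_obj);
        i915_gem_obj_finish_shmem_access(src_obj);
        return dst;
}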
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/drm/i915/i915_cmd_parser.c      | 127
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c  |   8
2 files changed, 54 insertions(+), 81 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 8ebc0ce44a76..5d9ea163d1c8 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -937,98 +937,63 @@ find_reg_in_tables(const struct drm_i915_reg_table *tables,
 	return NULL;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-		       unsigned start, unsigned len)
-{
-	int i;
-	void *addr = NULL;
-	struct sg_page_iter sg_iter;
-	int first_page = start >> PAGE_SHIFT;
-	int last_page = (len + start + 4095) >> PAGE_SHIFT;
-	int npages = last_page - first_page;
-	struct page **pages;
-
-	pages = drm_malloc_ab(npages, sizeof(*pages));
-	if (pages == NULL) {
-		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-		goto finish;
-	}
-
-	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
-		pages[i++] = sg_page_iter_page(&sg_iter);
-		if (i == npages)
-			break;
-	}
-
-	addr = vmap(pages, i, 0, PAGE_KERNEL);
-	if (addr == NULL) {
-		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-		goto finish;
-	}
-
-finish:
-	if (pages)
-		drm_free_large(pages);
-	return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
+/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
+static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 		       struct drm_i915_gem_object *src_obj,
 		       u32 batch_start_offset,
-		       u32 batch_len)
+		       u32 batch_len,
+		       bool *needs_clflush_after)
 {
-	unsigned int needs_clflush;
-	void *src_base, *src;
-	void *dst = NULL;
+	unsigned int src_needs_clflush;
+	unsigned int dst_needs_clflush;
+	void *src, *dst;
 	int ret;
 
-	if (batch_len > dest_obj->base.size ||
-	    batch_len + batch_start_offset > src_obj->base.size)
-		return ERR_PTR(-E2BIG);
-
-	if (WARN_ON(dest_obj->pages_pin_count == 0))
-		return ERR_PTR(-ENODEV);
-
-	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
+	if (ret)
 		return ERR_PTR(ret);
-	}
 
-	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-	if (!src_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		ret = -ENOMEM;
+	ret = i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush);
+	if (ret) {
+		dst = ERR_PTR(ret);
 		goto unpin_src;
 	}
 
-	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-		goto unmap_src;
+	src = i915_gem_object_pin_map(src_obj, I915_MAP_WB);
+	if (IS_ERR(src)) {
+		dst = src;
+		goto unpin_dst;
 	}
 
-	dst = vmap_batch(dest_obj, 0, batch_len);
-	if (!dst) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-		ret = -ENOMEM;
+	dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB);
+	if (IS_ERR(dst))
 		goto unmap_src;
-	}
 
-	src = src_base + offset_in_page(batch_start_offset);
-	if (needs_clflush)
+	src += batch_start_offset;
+	if (src_needs_clflush)
 		drm_clflush_virt_range(src, batch_len);
 
+	/* We can avoid clflushing partial cachelines before the write if we
+	 * only every write full cache-lines. Since we know that both the
+	 * source and destination are in multiples of PAGE_SIZE, we can simply
+	 * round up to the next cacheline. We don't care about copying too much
+	 * here as we only validate up to the end of the batch.
+	 */
+	if (dst_needs_clflush & CLFLUSH_BEFORE)
+		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
+
 	memcpy(dst, src, batch_len);
 
+	/* dst_obj is returned with vmap pinned */
+	*needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;
+
 unmap_src:
-	vunmap(src_base);
+	i915_gem_object_unpin_map(src_obj);
+unpin_dst:
+	i915_gem_obj_finish_shmem_access(dst_obj);
 unpin_src:
 	i915_gem_obj_finish_shmem_access(src_obj);
-
-	return ret ? ERR_PTR(ret) : dst;
+	return dst;
 }
 
 /**
@@ -1206,16 +1171,18 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		    u32 batch_len,
 		    bool is_master)
 {
-	u32 *cmd, *batch_base, *batch_end;
+	u32 *cmd, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
+	bool needs_clflush_after = false;
 	int ret = 0;
 
-	batch_base = copy_batch(shadow_batch_obj, batch_obj,
-				batch_start_offset, batch_len);
-	if (IS_ERR(batch_base)) {
+	cmd = copy_batch(shadow_batch_obj, batch_obj,
+			 batch_start_offset, batch_len,
+			 &needs_clflush_after);
+	if (IS_ERR(cmd)) {
 		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-		return PTR_ERR(batch_base);
+		return PTR_ERR(cmd);
 	}
 
 	/*
@@ -1223,9 +1190,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
 	 * space. Parsing should be faster in some cases this way.
 	 */
-	batch_end = batch_base + (batch_len / sizeof(*batch_end));
-
-	cmd = batch_base;
+	batch_end = cmd + (batch_len / sizeof(*batch_end));
 	while (cmd < batch_end) {
 		const struct drm_i915_cmd_descriptor *desc;
 		u32 length;
@@ -1284,7 +1249,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		ret = -EINVAL;
 	}
 
-	vunmap(batch_base);
+	if (ret == 0 && needs_clflush_after)
+		drm_clflush_virt_range(shadow_batch_obj->mapping, batch_len);
+	i915_gem_object_unpin_map(shadow_batch_obj);
 
 	return ret;
 }
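As a side note on the cacheline round-up in copy_batch() above: padding the
copy length to a whole number of clflush-sized lines means the memcpy() never
writes a partial cacheline, which is what lets the CLFLUSH_BEFORE case skip
flushing partial lines. A tiny standalone illustration (user-space C with
made-up sizes, not kernel code):

#include <stdio.h>

/* Same rounding the patch applies via roundup(batch_len, x86_clflush_size). */
static unsigned int round_up_to(unsigned int len, unsigned int align)
{
        return ((len + align - 1) / align) * align;
}

int main(void)
{
        unsigned int clflush_size = 64;  /* hypothetical clflush line size */
        unsigned int batch_len = 100;    /* hypothetical batch length */

        /* 100 bytes becomes a 128-byte copy: two full cachelines, no partial
         * line written, and the extra bytes are never validated anyway. */
        printf("copy %u bytes instead of %u\n",
               round_up_to(batch_len, clflush_size), batch_len);
        return 0;
}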
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 907386630e26..4192066ff60e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1512,7 +1512,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
 			params->args_batch_start_offset;
 
 	if (exec_len == 0)
-		exec_len = params->batch->size;
+		exec_len = params->batch->size - params->args_batch_start_offset;
 
 	ret = params->engine->emit_bb_start(params->request,
 					    exec_start, exec_len,
@@ -1738,6 +1738,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		ret = -EINVAL;
 		goto err;
 	}
+	if (args->batch_start_offset > params->batch->size ||
+	    args->batch_len > params->batch->size - args->batch_start_offset) {
+		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
+		ret = -EINVAL;
+		goto err;
+	}
 
 	params->args_batch_start_offset = args->batch_start_offset;
 	if (intel_engine_needs_cmd_parser(engine) && args->batch_len) {
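One property worth noting about the new execbuffer check just above: written
as "offset > size || len > size - offset", it cannot wrap, whereas the
"len + offset > size" form it replaces in copy_batch() can overflow in 32-bit
arithmetic and wrongly pass. A standalone illustration with made-up values
(user-space C, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Old-style test: the u32 addition can wrap and slip past the size check. */
static int in_bounds_overflowing(uint32_t size, uint32_t offset, uint32_t len)
{
        return offset + len <= size;
}

/* New-style test, shaped like the execbuffer hunk above: no addition, no wrap. */
static int in_bounds_safe(uint32_t size, uint32_t offset, uint32_t len)
{
        return offset <= size && len <= size - offset;
}

int main(void)
{
        uint32_t size = 4096, offset = 16, len = UINT32_MAX - 8; /* made up */

        /* 16 + (UINT32_MAX - 8) wraps to 7, so the old form accepts a batch
         * that is wildly out of bounds; the rearranged form rejects it. */
        printf("overflowing form says: %d\n", in_bounds_overflowing(size, offset, len));
        printf("rearranged form says:  %d\n", in_bounds_safe(size, offset, len));
        return 0;
}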