diff options
author | Jussi Rasanen <jrasanen@nvidia.com> | 2014-10-03 05:44:05 -0400 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-03-18 15:12:06 -0400 |
commit | 529962911c2e9b5c4e3a95b6c78dba8f15447a93 (patch) | |
tree | e439afdb7203e0810e543711b4333ede8f002b31 /drivers/gpu/nvgpu | |
parent | 6e22f39e8747a8ab9c720ef2e5236e5c94767f88 (diff) |
gpu: nvgpu: cde: Combine H and V passes

When using CDE firmware v1, combine H and V swizzling passes into one
pushbuffer submission. This removes one GPU context switch, almost
halving the time taken for swizzling.

Map only the compbit part of the destination surface.

Bug 1546619

Change-Id: I95ed4e4c2eefd6d24a58854d31929cdb91ff556b
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/553234
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 541 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 45 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 2 |
4 files changed, 390 insertions, 200 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 472cc81c..8b2ed55e 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -337,8 +337,8 @@ static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, | |||
337 | } | 337 | } |
338 | 338 | ||
339 | static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, | 339 | static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, |
340 | const struct firmware *img, | 340 | const struct firmware *img, |
341 | struct gk20a_cde_hdr_replace *replace) | 341 | struct gk20a_cde_hdr_replace *replace) |
342 | { | 342 | { |
343 | struct gk20a_cde_mem_desc *source_mem; | 343 | struct gk20a_cde_mem_desc *source_mem; |
344 | struct gk20a_cde_mem_desc *target_mem; | 344 | struct gk20a_cde_mem_desc *target_mem; |
@@ -410,26 +410,26 @@ static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx) | |||
410 | g->gr.cacheline_size; | 410 | g->gr.cacheline_size; |
411 | break; | 411 | break; |
412 | case TYPE_PARAM_FIRSTPAGEOFFSET: | 412 | case TYPE_PARAM_FIRSTPAGEOFFSET: |
413 | new_data = cde_ctx->src_param_offset; | 413 | new_data = cde_ctx->surf_param_offset; |
414 | break; | 414 | break; |
415 | case TYPE_PARAM_NUMPAGES: | 415 | case TYPE_PARAM_NUMPAGES: |
416 | new_data = cde_ctx->src_param_lines; | 416 | new_data = cde_ctx->surf_param_lines; |
417 | break; | 417 | break; |
418 | case TYPE_PARAM_BACKINGSTORE: | 418 | case TYPE_PARAM_BACKINGSTORE: |
419 | new_data = cde_ctx->backing_store_vaddr; | 419 | new_data = cde_ctx->backing_store_vaddr; |
420 | break; | 420 | break; |
421 | case TYPE_PARAM_DESTINATION: | 421 | case TYPE_PARAM_DESTINATION: |
422 | new_data = cde_ctx->dest_vaddr; | 422 | new_data = cde_ctx->compbit_vaddr; |
423 | break; | 423 | break; |
424 | case TYPE_PARAM_DESTINATION_SIZE: | 424 | case TYPE_PARAM_DESTINATION_SIZE: |
425 | new_data = cde_ctx->dest_size; | 425 | new_data = cde_ctx->compbit_size; |
426 | break; | 426 | break; |
427 | case TYPE_PARAM_BACKINGSTORE_SIZE: | 427 | case TYPE_PARAM_BACKINGSTORE_SIZE: |
428 | new_data = g->gr.compbit_store.size; | 428 | new_data = g->gr.compbit_store.size; |
429 | break; | 429 | break; |
430 | case TYPE_PARAM_SOURCE_SMMU_ADDR: | 430 | case TYPE_PARAM_SOURCE_SMMU_ADDR: |
431 | new_data = gk20a_mm_gpuva_to_iova(cde_ctx->vm, | 431 | new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm, |
432 | cde_ctx->src_vaddr); | 432 | cde_ctx->surf_vaddr); |
433 | if (new_data == 0) | 433 | if (new_data == 0) |
434 | err = -EINVAL; | 434 | err = -EINVAL; |
435 | break; | 435 | break; |
@@ -605,8 +605,9 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx, | |||
605 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | 605 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, |
606 | const struct firmware *img) | 606 | const struct firmware *img) |
607 | { | 607 | { |
608 | struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app; | ||
608 | u32 *data = (u32 *)img->data; | 609 | u32 *data = (u32 *)img->data; |
609 | u32 version, num_of_elems; | 610 | u32 num_of_elems; |
610 | struct gk20a_cde_hdr_elem *elem; | 611 | struct gk20a_cde_hdr_elem *elem; |
611 | u32 min_size = 0; | 612 | u32 min_size = 0; |
612 | int err = 0; | 613 | int err = 0; |
@@ -618,7 +619,7 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | |||
618 | return -EINVAL; | 619 | return -EINVAL; |
619 | } | 620 | } |
620 | 621 | ||
621 | version = data[0]; | 622 | cde_app->firmware_version = data[0]; |
622 | num_of_elems = data[1]; | 623 | num_of_elems = data[1]; |
623 | 624 | ||
624 | min_size += num_of_elems * sizeof(*elem); | 625 | min_size += num_of_elems * sizeof(*elem); |
@@ -654,6 +655,11 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | |||
654 | elem->command.num_entries); | 655 | elem->command.num_entries); |
655 | break; | 656 | break; |
656 | } | 657 | } |
658 | case TYPE_ARRAY: | ||
659 | memcpy(&cde_app->arrays[elem->array.id][0], | ||
660 | elem->array.data, | ||
661 | MAX_CDE_ARRAY_ENTRIES*sizeof(u32)); | ||
662 | break; | ||
657 | default: | 663 | default: |
658 | gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); | 664 | gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); |
659 | err = -EINVAL; | 665 | err = -EINVAL; |
@@ -853,27 +859,25 @@ static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g) | |||
853 | } | 859 | } |
854 | 860 | ||
855 | int gk20a_cde_convert(struct gk20a *g, | 861 | int gk20a_cde_convert(struct gk20a *g, |
856 | struct dma_buf *dst, | 862 | struct dma_buf *compbits_buf, |
857 | s32 dst_kind, u64 dst_byte_offset, | 863 | s32 compbits_kind, u64 compbits_byte_offset, |
858 | u32 dst_size, struct nvgpu_fence *fence, | 864 | u32 compbits_size, struct nvgpu_fence *fence, |
859 | u32 __flags, struct gk20a_cde_param *params, | 865 | u32 __flags, struct gk20a_cde_param *params, |
860 | int num_params, struct gk20a_fence **fence_out) | 866 | int num_params, struct gk20a_fence **fence_out) |
861 | __acquires(&cde_app->mutex) | 867 | __acquires(&cde_app->mutex) |
862 | __releases(&cde_app->mutex) | 868 | __releases(&cde_app->mutex) |
863 | { | 869 | { |
864 | struct gk20a_cde_app *cde_app = &g->cde_app; | 870 | struct gk20a_cde_ctx *cde_ctx = NULL; |
865 | struct gk20a_comptags comptags; | 871 | struct gk20a_comptags comptags; |
866 | struct gk20a_cde_ctx *cde_ctx; | 872 | u64 compbits_offset = 0; |
867 | u64 dst_vaddr = 0; | 873 | u64 map_vaddr = 0; |
874 | u64 map_offset = 0; | ||
875 | u32 map_size = 0; | ||
876 | u64 big_page_mask = 0; | ||
868 | u32 flags; | 877 | u32 flags; |
869 | int err, i; | 878 | int err, i; |
870 | 879 | ||
871 | if (!cde_app->initialised) { | 880 | mutex_lock(&g->cde_app.mutex); |
872 | gk20a_warn(&g->dev->dev, "cde: conversion requrest but no image has been provided"); | ||
873 | return -ENOSYS; | ||
874 | } | ||
875 | |||
876 | mutex_lock(&cde_app->mutex); | ||
877 | 881 | ||
878 | cde_ctx = gk20a_cde_get_context(g); | 882 | cde_ctx = gk20a_cde_get_context(g); |
879 | if (IS_ERR(cde_ctx)) { | 883 | if (IS_ERR(cde_ctx)) { |
@@ -881,38 +885,53 @@ __releases(&cde_app->mutex) | |||
881 | goto exit_unlock; | 885 | goto exit_unlock; |
882 | } | 886 | } |
883 | 887 | ||
884 | /* First, map the buffers to local va */ | 888 | /* First, map the buffer to local va */ |
885 | 889 | ||
886 | /* ensure that the dst buffer has drvdata */ | 890 | /* ensure that the compbits buffer has drvdata */ |
887 | err = gk20a_dmabuf_alloc_drvdata(dst, &g->dev->dev); | 891 | err = gk20a_dmabuf_alloc_drvdata(compbits_buf, &g->dev->dev); |
888 | if (err) | 892 | if (err) |
889 | goto exit_unlock; | 893 | goto exit_unlock; |
890 | 894 | ||
895 | /* compbits don't start at page aligned offset, so we need to align | ||
896 | the region to be mapped */ | ||
897 | big_page_mask = cde_ctx->vm->big_page_size - 1; | ||
898 | map_offset = compbits_byte_offset & ~big_page_mask; | ||
899 | |||
900 | /* compute compbit start offset from the beginning of the mapped | ||
901 | area */ | ||
902 | compbits_offset = compbits_byte_offset & big_page_mask; | ||
903 | |||
904 | if (!compbits_size) { | ||
905 | compbits_size = compbits_buf->size - compbits_byte_offset; | ||
906 | map_size = compbits_buf->size - map_offset; | ||
907 | } | ||
908 | |||
891 | /* map the destination buffer */ | 909 | /* map the destination buffer */ |
892 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ | 910 | get_dma_buf(compbits_buf); /* a ref for gk20a_vm_map */ |
893 | dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0, | 911 | map_vaddr = gk20a_vm_map(cde_ctx->vm, compbits_buf, 0, |
894 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | 912 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
895 | dst_kind, NULL, true, | 913 | compbits_kind, NULL, true, |
896 | gk20a_mem_flag_none, | 914 | gk20a_mem_flag_none, |
897 | 0, 0); | 915 | map_offset, map_size); |
898 | if (!dst_vaddr) { | 916 | if (!map_vaddr) { |
899 | dma_buf_put(dst); | 917 | dma_buf_put(compbits_buf); |
900 | err = -EINVAL; | 918 | err = -EINVAL; |
901 | goto exit_unlock; | 919 | goto exit_unlock; |
902 | } | 920 | } |
903 | 921 | ||
904 | if (!dst_size) | ||
905 | dst_size = dst->size - dst_byte_offset; | ||
906 | |||
907 | /* store source buffer compression tags */ | 922 | /* store source buffer compression tags */ |
908 | gk20a_get_comptags(&g->dev->dev, dst, &comptags); | 923 | gk20a_get_comptags(&g->dev->dev, compbits_buf, &comptags); |
909 | cde_ctx->src_vaddr = dst_vaddr; | 924 | cde_ctx->surf_param_offset = comptags.offset; |
910 | cde_ctx->src_param_offset = comptags.offset; | 925 | cde_ctx->surf_param_lines = comptags.lines; |
911 | cde_ctx->src_param_lines = comptags.lines; | 926 | |
927 | /* store surface vaddr. This is actually compbit vaddr, but since | ||
928 | compbits live in the same surface, and we can get the alloc base | ||
929 | address by using gk20a_mm_gpuva_to_iova_base, this will do */ | ||
930 | cde_ctx->surf_vaddr = map_vaddr; | ||
912 | 931 | ||
913 | /* store information about destination */ | 932 | /* store information about destination */ |
914 | cde_ctx->dest_vaddr = dst_vaddr + dst_byte_offset; | 933 | cde_ctx->compbit_vaddr = map_vaddr + compbits_offset; |
915 | cde_ctx->dest_size = dst_size; | 934 | cde_ctx->compbit_size = compbits_size; |
916 | 935 | ||
917 | /* remove existing argument data */ | 936 | /* remove existing argument data */ |
918 | memset(cde_ctx->user_param_values, 0, | 937 | memset(cde_ctx->user_param_values, 0, |
@@ -940,8 +959,8 @@ __releases(&cde_app->mutex) | |||
940 | 959 | ||
941 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", | 960 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", |
942 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); | 961 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); |
943 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", | 962 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n", |
944 | cde_ctx->dest_size, cde_ctx->dest_vaddr); | 963 | cde_ctx->compbit_size, cde_ctx->compbit_vaddr); |
945 | 964 | ||
946 | /* execute the init push buffer */ | 965 | /* execute the init push buffer */ |
947 | if (!cde_ctx->init_cmd_executed) { | 966 | if (!cde_ctx->init_cmd_executed) { |
@@ -964,11 +983,10 @@ __releases(&cde_app->mutex) | |||
964 | exit_unlock: | 983 | exit_unlock: |
965 | 984 | ||
966 | /* unmap the buffers - channel holds references to them now */ | 985 | /* unmap the buffers - channel holds references to them now */ |
967 | if (dst_vaddr) | 986 | if (map_vaddr) |
968 | gk20a_vm_unmap(cde_ctx->vm, dst_vaddr); | 987 | gk20a_vm_unmap(cde_ctx->vm, map_vaddr); |
969 | |||
970 | mutex_unlock(&cde_app->mutex); | ||
971 | 988 | ||
989 | mutex_unlock(&g->cde_app.mutex); | ||
972 | return err; | 990 | return err; |
973 | } | 991 | } |
974 | 992 | ||
@@ -1159,152 +1177,322 @@ __releases(&cde_app->mutex) | |||
1159 | return err; | 1177 | return err; |
1160 | } | 1178 | } |
1161 | 1179 | ||
1162 | enum cde_launch_patch_offset { | ||
1163 | /* dst buffer width in roptiles */ | ||
1164 | PATCH_USER_CONST_XTILES, | ||
1165 | /* dst buffer height in roptiles */ | ||
1166 | PATCH_USER_CONST_YTILES, | ||
1167 | /* dst buffer log2(block height) */ | ||
1168 | PATCH_USER_CONST_BLOCKHEIGHTLOG2, | ||
1169 | /* dst buffer pitch in bytes */ | ||
1170 | PATCH_USER_CONST_DSTPITCH, | ||
1171 | /* dst buffer write offset */ | ||
1172 | PATCH_USER_CONST_DSTOFFSET, | ||
1173 | /* comp cache index of the first page of the surface, | ||
1174 | * kernel looks it up from PTE */ | ||
1175 | PATCH_USER_CONST_FIRSTPAGEOFFSET, | ||
1176 | /* gmmu translated surface address, kernel fills */ | ||
1177 | PATCH_USER_CONST_SURFADDR, | ||
1178 | /* dst buffer address >> 8, kernel fills */ | ||
1179 | PATCH_VPC_DSTIMAGE_ADDR, | ||
1180 | /* dst buffer address >> 8, kernel fills */ | ||
1181 | PATCH_VPC_DSTIMAGE_ADDR2, | ||
1182 | /* dst buffer size - 1, kernel fills */ | ||
1183 | PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE, | ||
1184 | /* dst buffer size - 1, kernel fills */ | ||
1185 | PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE2, | ||
1186 | /* dst buffer size, kernel fills */ | ||
1187 | PATCH_VPC_DSTIMAGE_SIZE, | ||
1188 | /* dst buffer width in roptiles / work group width */ | ||
1189 | PATCH_VPC_CURRENT_GRID_SIZE_X, | ||
1190 | /* dst buffer height in roptiles / work group height */ | ||
1191 | PATCH_VPC_CURRENT_GRID_SIZE_Y, | ||
1192 | /* 1 */ | ||
1193 | PATCH_VPC_CURRENT_GRID_SIZE_Z, | ||
1194 | /* work group width, 16 seems to be quite optimal */ | ||
1195 | PATCH_VPC_CURRENT_GROUP_SIZE_X, | ||
1196 | /* work group height, 8 seems to be quite optimal */ | ||
1197 | PATCH_VPC_CURRENT_GROUP_SIZE_Y, | ||
1198 | /* 1 */ | ||
1199 | PATCH_VPC_CURRENT_GROUP_SIZE_Z, | ||
1200 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_X */ | ||
1201 | PATCH_QMD_CTA_RASTER_WIDTH, | ||
1202 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_Y */ | ||
1203 | PATCH_QMD_CTA_RASTER_HEIGHT, | ||
1204 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_Z */ | ||
1205 | PATCH_QMD_CTA_RASTER_DEPTH, | ||
1206 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_X */ | ||
1207 | PATCH_QMD_CTA_THREAD_DIMENSION0, | ||
1208 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Y */ | ||
1209 | PATCH_QMD_CTA_THREAD_DIMENSION1, | ||
1210 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Z */ | ||
1211 | PATCH_QMD_CTA_THREAD_DIMENSION2, | ||
1212 | |||
1213 | NUM_CDE_LAUNCH_PATCHES | ||
1214 | }; | ||
1215 | |||
1216 | enum cde_launch_patch_id { | 1180 | enum cde_launch_patch_id { |
1217 | PATCH_QMD_CTA_RASTER_WIDTH_ID = 1024, | 1181 | PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024, |
1218 | PATCH_QMD_CTA_RASTER_HEIGHT_ID = 1025, | 1182 | PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025, |
1219 | PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, | 1183 | PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, /* for firmware v0 only */ |
1220 | PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, | 1184 | PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, |
1221 | PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, | 1185 | PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, |
1222 | PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, | 1186 | PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, /* for firmware v0 only */ |
1223 | PATCH_USER_CONST_XTILES_ID = 1030, | 1187 | PATCH_USER_CONST_XTILES_ID = 1030, /* for firmware v0 only */ |
1224 | PATCH_USER_CONST_YTILES_ID = 1031, | 1188 | PATCH_USER_CONST_YTILES_ID = 1031, /* for firmware v0 only */ |
1225 | PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, | 1189 | PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, |
1226 | PATCH_USER_CONST_DSTPITCH_ID = 1033, | 1190 | PATCH_USER_CONST_DSTPITCH_ID = 1033, /* for firmware v0 only */ |
1227 | PATCH_USER_CONST_DSTOFFSET_ID = 1034, | 1191 | PATCH_H_USER_CONST_FLAGS_ID = 1034, /* for firmware v0 only */ |
1228 | PATCH_VPC_CURRENT_GRID_SIZE_X_ID = 1035, | 1192 | PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035, |
1229 | PATCH_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, | 1193 | PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, |
1230 | PATCH_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, | 1194 | PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, |
1231 | PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, | 1195 | PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, |
1232 | PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, | 1196 | PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, |
1233 | PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, | 1197 | PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, |
1198 | PATCH_USER_CONST_XBLOCKS_ID = 1041, | ||
1199 | PATCH_H_USER_CONST_DSTOFFSET_ID = 1042, | ||
1200 | PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043, | ||
1201 | PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044, | ||
1202 | PATCH_V_USER_CONST_DSTOFFSET_ID = 1045, | ||
1203 | PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046, | ||
1204 | PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047, | ||
1205 | PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048, | ||
1206 | PATCH_H_LAUNCH_WORD1_ID = 1049, | ||
1207 | PATCH_H_LAUNCH_WORD2_ID = 1050, | ||
1208 | PATCH_V_LAUNCH_WORD1_ID = 1051, | ||
1209 | PATCH_V_LAUNCH_WORD2_ID = 1052, | ||
1210 | PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053, | ||
1211 | PATCH_H_QMD_REGISTER_COUNT_ID = 1054, | ||
1212 | PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055, | ||
1213 | PATCH_V_QMD_REGISTER_COUNT_ID = 1056, | ||
1234 | }; | 1214 | }; |
1235 | 1215 | ||
1236 | static int gk20a_buffer_convert_gpu_to_cde( | 1216 | enum programs { |
1237 | struct gk20a *g, struct dma_buf *dmabuf, u32 consumer, | 1217 | PROG_HPASS = 0, |
1238 | u64 offset, u64 compbits_offset, | 1218 | PROG_VPASS_LARGE = 1, |
1219 | PROG_VPASS_SMALL = 2, | ||
1220 | PROG_HPASS_DEBUG = 3, | ||
1221 | PROG_VPASS_LARGE_DEBUG = 4, | ||
1222 | PROG_VPASS_SMALL_DEBUG = 5, | ||
1223 | PROG_PASSTHROUGH = 6, | ||
1224 | NUM_PROGRAMS = 7 | ||
1225 | }; | ||
1226 | |||
1227 | /* maximum number of WRITE_PATCHes in the below function */ | ||
1228 | #define MAX_CDE_LAUNCH_PATCHES 32 | ||
1229 | |||
1230 | static int gk20a_buffer_convert_gpu_to_cde_v0( | ||
1231 | struct gk20a *g, | ||
1232 | struct dma_buf *dmabuf, u32 consumer, | ||
1233 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1239 | u32 width, u32 height, u32 block_height_log2, | 1234 | u32 width, u32 height, u32 block_height_log2, |
1240 | u32 submit_flags, struct nvgpu_fence *fence_in, | 1235 | u32 submit_flags, struct nvgpu_fence *fence_in, |
1241 | struct gk20a_fence **fence_out) | 1236 | struct gk20a_buffer_state *state) |
1242 | { | 1237 | { |
1243 | struct gk20a_cde_param params[NUM_CDE_LAUNCH_PATCHES]; | 1238 | struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES]; |
1244 | int param = 0; | 1239 | int param = 0; |
1245 | int err = 0; | 1240 | int err = 0; |
1241 | struct gk20a_fence *new_fence = NULL; | ||
1242 | const int wgx = 8; | ||
1243 | const int wgy = 8; | ||
1244 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | ||
1245 | const int xalign = compbits_per_byte * wgx; | ||
1246 | const int yalign = wgy; | ||
1246 | 1247 | ||
1247 | /* Compute per launch parameters */ | 1248 | /* firmware v0 needs to call swizzling twice */ |
1248 | const bool transpose = (consumer == NVGPU_GPU_COMPBITS_CDEV); | 1249 | int i; |
1249 | const int transposed_width = transpose ? height : width; | 1250 | for (i = 0; i < 2; i++) { |
1250 | const int transposed_height = transpose ? width : height; | 1251 | /* Compute per launch parameters */ |
1251 | const int xtiles = (transposed_width + 7) >> 3; | 1252 | const bool vpass = (i == 1); |
1252 | const int ytiles = (transposed_height + 7) >> 3; | 1253 | const int transposed_width = vpass ? height : width; |
1254 | const int transposed_height = vpass ? width : height; | ||
1255 | const int xtiles = (transposed_width + 7) >> 3; | ||
1256 | const int ytiles = (transposed_height + 7) >> 3; | ||
1257 | const int gridw = roundup(xtiles, xalign) / xalign; | ||
1258 | const int gridh = roundup(ytiles, yalign) / yalign; | ||
1259 | const int flags = (vpass ? 4 : 0) | | ||
1260 | g->cde_app.shader_parameter; | ||
1261 | const int dst_stride = 128; /* chip constant */ | ||
1262 | |||
1263 | if ((vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEV)) || | ||
1264 | (!vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEH))) | ||
1265 | continue; | ||
1266 | |||
1267 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) | ||
1268 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", | ||
1269 | xtiles, ytiles); | ||
1270 | |||
1271 | gk20a_dbg(gpu_dbg_cde, "pass=%c", vpass ? 'V' : 'H'); | ||
1272 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx", | ||
1273 | width, height, block_height_log2, | ||
1274 | compbits_hoffset, compbits_voffset); | ||
1275 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)", | ||
1276 | width, height, xtiles, ytiles); | ||
1277 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | ||
1278 | wgx, wgy, gridw, gridh); | ||
1279 | |||
1280 | /* Write parameters */ | ||
1281 | #define WRITE_PATCH(NAME, VALUE) \ | ||
1282 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | ||
1283 | param = 0; | ||
1284 | WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles); | ||
1285 | WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles); | ||
1286 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, | ||
1287 | block_height_log2); | ||
1288 | WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride); | ||
1289 | WRITE_PATCH(PATCH_H_USER_CONST_FLAGS, flags); | ||
1290 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw); | ||
1291 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh); | ||
1292 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1293 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); | ||
1294 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); | ||
1295 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); | ||
1296 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw); | ||
1297 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh); | ||
1298 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1); | ||
1299 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); | ||
1300 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); | ||
1301 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1); | ||
1302 | #undef WRITE_PATCH | ||
1303 | |||
1304 | err = gk20a_cde_convert(g, dmabuf, | ||
1305 | 0, /* dst kind */ | ||
1306 | vpass ? | ||
1307 | compbits_voffset : | ||
1308 | compbits_hoffset, | ||
1309 | 0, /* dst_size, 0 = auto */ | ||
1310 | fence_in, submit_flags, | ||
1311 | params, param, | ||
1312 | &new_fence); | ||
1313 | if (err) | ||
1314 | goto out; | ||
1315 | |||
1316 | /* compbits generated, update state & fence */ | ||
1317 | gk20a_fence_put(state->fence); | ||
1318 | state->fence = new_fence; | ||
1319 | state->valid_compbits |= vpass ? | ||
1320 | NVGPU_GPU_COMPBITS_CDEV : | ||
1321 | NVGPU_GPU_COMPBITS_CDEH; | ||
1322 | } | ||
1323 | out: | ||
1324 | return err; | ||
1325 | } | ||
1326 | |||
1327 | static int gk20a_buffer_convert_gpu_to_cde_v1( | ||
1328 | struct gk20a *g, | ||
1329 | struct dma_buf *dmabuf, u32 consumer, | ||
1330 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1331 | u32 width, u32 height, u32 block_height_log2, | ||
1332 | u32 submit_flags, struct nvgpu_fence *fence_in, | ||
1333 | struct gk20a_buffer_state *state) | ||
1334 | { | ||
1335 | struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES]; | ||
1336 | int param = 0; | ||
1337 | int err = 0; | ||
1338 | struct gk20a_fence *new_fence = NULL; | ||
1253 | const int wgx = 8; | 1339 | const int wgx = 8; |
1254 | const int wgy = 8; | 1340 | const int wgy = 8; |
1255 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | 1341 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ |
1256 | const int dst_stride = 128; /* TODO chip constant */ | ||
1257 | const int xalign = compbits_per_byte * wgx; | 1342 | const int xalign = compbits_per_byte * wgx; |
1258 | const int yalign = wgy; | 1343 | const int yalign = wgy; |
1259 | const int gridw = roundup(xtiles, xalign) / xalign; | ||
1260 | const int gridh = roundup(ytiles, yalign) / yalign; | ||
1261 | 1344 | ||
1262 | if (!g->cde_app.initialised) | 1345 | /* Compute per launch parameters */ |
1263 | return -ENOSYS; | 1346 | const int xtiles = (width + 7) >> 3; |
1347 | const int ytiles = (height + 7) >> 3; | ||
1348 | const int gridw_h = roundup(xtiles, xalign) / xalign; | ||
1349 | const int gridh_h = roundup(ytiles, yalign) / yalign; | ||
1350 | const int gridw_v = roundup(ytiles, xalign) / xalign; | ||
1351 | const int gridh_v = roundup(xtiles, yalign) / yalign; | ||
1352 | const int xblocks = (xtiles + 1) >> 1; | ||
1353 | const int voffset = compbits_voffset - compbits_hoffset; | ||
1354 | |||
1355 | int hprog = PROG_HPASS; | ||
1356 | int vprog = (block_height_log2 >= 2) ? | ||
1357 | PROG_VPASS_LARGE : PROG_VPASS_SMALL; | ||
1358 | if (g->cde_app.shader_parameter == 1) { | ||
1359 | hprog = PROG_PASSTHROUGH; | ||
1360 | vprog = PROG_PASSTHROUGH; | ||
1361 | } else if (g->cde_app.shader_parameter == 2) { | ||
1362 | hprog = PROG_HPASS_DEBUG; | ||
1363 | vprog = (block_height_log2 >= 2) ? | ||
1364 | PROG_VPASS_LARGE_DEBUG : | ||
1365 | PROG_VPASS_SMALL_DEBUG; | ||
1366 | } | ||
1264 | 1367 | ||
1265 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) | 1368 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) |
1266 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", | 1369 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", |
1267 | xtiles, ytiles); | 1370 | xtiles, ytiles); |
1268 | 1371 | ||
1269 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", | 1372 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx", |
1270 | width, height, block_height_log2, compbits_offset); | 1373 | width, height, block_height_log2, |
1271 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", | 1374 | compbits_hoffset, compbits_voffset); |
1272 | width, height, xtiles, ytiles, gridw*wgx, gridh*wgy); | 1375 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)", |
1273 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | 1376 | width, height, xtiles, ytiles); |
1274 | wgx, wgy, gridw, gridh); | 1377 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)", |
1378 | wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v); | ||
1379 | gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d", | ||
1380 | hprog, | ||
1381 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog], | ||
1382 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog], | ||
1383 | vprog, | ||
1384 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog], | ||
1385 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]); | ||
1275 | 1386 | ||
1276 | /* Write parameters */ | 1387 | /* Write parameters */ |
1277 | #define WRITE_PATCH(NAME, VALUE) \ | 1388 | #define WRITE_PATCH(NAME, VALUE) \ |
1278 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | 1389 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} |
1279 | WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles); | 1390 | WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks); |
1280 | WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles); | 1391 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, |
1281 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, block_height_log2); | 1392 | block_height_log2); |
1282 | WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride); | 1393 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); |
1283 | WRITE_PATCH(PATCH_USER_CONST_DSTOFFSET, | 1394 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); |
1284 | (transpose ? 4 : 0) | g->cde_app.shader_parameter); | ||
1285 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_X, gridw); | ||
1286 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Y, gridh); | ||
1287 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1288 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); | 1395 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); |
1289 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); | 1396 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); |
1290 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); | 1397 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); |
1291 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_WIDTH, gridw); | 1398 | |
1292 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_HEIGHT, gridh); | 1399 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h); |
1293 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1); | 1400 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h); |
1294 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); | 1401 | WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0); |
1295 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); | 1402 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h); |
1296 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1); | 1403 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h); |
1404 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1405 | |||
1406 | WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v); | ||
1407 | WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v); | ||
1408 | WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset); | ||
1409 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v); | ||
1410 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v); | ||
1411 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1412 | |||
1413 | WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET, | ||
1414 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]); | ||
1415 | WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT, | ||
1416 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]); | ||
1417 | WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET, | ||
1418 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]); | ||
1419 | WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT, | ||
1420 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]); | ||
1421 | |||
1422 | if (consumer & NVGPU_GPU_COMPBITS_CDEH) { | ||
1423 | WRITE_PATCH(PATCH_H_LAUNCH_WORD1, | ||
1424 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]); | ||
1425 | WRITE_PATCH(PATCH_H_LAUNCH_WORD2, | ||
1426 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]); | ||
1427 | } else { | ||
1428 | WRITE_PATCH(PATCH_H_LAUNCH_WORD1, | ||
1429 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]); | ||
1430 | WRITE_PATCH(PATCH_H_LAUNCH_WORD2, | ||
1431 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]); | ||
1432 | } | ||
1433 | |||
1434 | if (consumer & NVGPU_GPU_COMPBITS_CDEV) { | ||
1435 | WRITE_PATCH(PATCH_V_LAUNCH_WORD1, | ||
1436 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]); | ||
1437 | WRITE_PATCH(PATCH_V_LAUNCH_WORD2, | ||
1438 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]); | ||
1439 | } else { | ||
1440 | WRITE_PATCH(PATCH_V_LAUNCH_WORD1, | ||
1441 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]); | ||
1442 | WRITE_PATCH(PATCH_V_LAUNCH_WORD2, | ||
1443 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]); | ||
1444 | } | ||
1297 | #undef WRITE_PATCH | 1445 | #undef WRITE_PATCH |
1298 | 1446 | ||
1299 | err = gk20a_busy(g->dev); | ||
1300 | if (err) | ||
1301 | return err; | ||
1302 | err = gk20a_cde_convert(g, dmabuf, | 1447 | err = gk20a_cde_convert(g, dmabuf, |
1303 | 0, /* dst kind */ | 1448 | 0, /* dst kind */ |
1304 | compbits_offset, | 1449 | compbits_hoffset, |
1305 | 0, /* dst_size, 0 = auto */ | 1450 | 0, /* dst_size, 0 = auto */ |
1306 | fence_in, submit_flags, | 1451 | fence_in, submit_flags, |
1307 | params, param, fence_out); | 1452 | params, param, &new_fence); |
1453 | if (err) | ||
1454 | goto out; | ||
1455 | |||
1456 | /* compbits generated, update state & fence */ | ||
1457 | gk20a_fence_put(state->fence); | ||
1458 | state->fence = new_fence; | ||
1459 | state->valid_compbits |= consumer & | ||
1460 | (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV); | ||
1461 | out: | ||
1462 | return err; | ||
1463 | } | ||
1464 | |||
1465 | static int gk20a_buffer_convert_gpu_to_cde( | ||
1466 | struct gk20a *g, struct dma_buf *dmabuf, u32 consumer, | ||
1467 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1468 | u32 width, u32 height, u32 block_height_log2, | ||
1469 | u32 submit_flags, struct nvgpu_fence *fence_in, | ||
1470 | struct gk20a_buffer_state *state) | ||
1471 | { | ||
1472 | int err = 0; | ||
1473 | |||
1474 | if (!g->cde_app.initialised) | ||
1475 | return -ENOSYS; | ||
1476 | |||
1477 | err = gk20a_busy(g->dev); | ||
1478 | if (err) | ||
1479 | return err; | ||
1480 | |||
1481 | gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n", | ||
1482 | g->cde_app.firmware_version); | ||
1483 | |||
1484 | if (g->cde_app.firmware_version == 0) { | ||
1485 | err = gk20a_buffer_convert_gpu_to_cde_v0( | ||
1486 | g, dmabuf, consumer, offset, compbits_hoffset, | ||
1487 | compbits_voffset, width, height, block_height_log2, | ||
1488 | submit_flags, fence_in, state); | ||
1489 | } else { | ||
1490 | err = gk20a_buffer_convert_gpu_to_cde_v1( | ||
1491 | g, dmabuf, consumer, offset, compbits_hoffset, | ||
1492 | compbits_voffset, width, height, block_height_log2, | ||
1493 | submit_flags, fence_in, state); | ||
1494 | } | ||
1495 | |||
1308 | gk20a_idle(g->dev); | 1496 | gk20a_idle(g->dev); |
1309 | return err; | 1497 | return err; |
1310 | } | 1498 | } |
@@ -1326,7 +1514,8 @@ int gk20a_prepare_compressible_read( | |||
1326 | if (IS_ERR(dmabuf)) | 1514 | if (IS_ERR(dmabuf)) |
1327 | return -EINVAL; | 1515 | return -EINVAL; |
1328 | 1516 | ||
1329 | err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state); | 1517 | err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), |
1518 | offset, &state); | ||
1330 | if (err) { | 1519 | if (err) { |
1331 | dma_buf_put(dmabuf); | 1520 | dma_buf_put(dmabuf); |
1332 | return err; | 1521 | return err; |
@@ -1345,40 +1534,20 @@ int gk20a_prepare_compressible_read( | |||
1345 | err = -EINVAL; | 1534 | err = -EINVAL; |
1346 | goto out; | 1535 | goto out; |
1347 | } else if (missing_bits) { | 1536 | } else if (missing_bits) { |
1348 | struct gk20a_fence *new_fence = NULL; | 1537 | u32 missing_cde_bits = missing_bits & |
1538 | (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV); | ||
1349 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && | 1539 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && |
1350 | (missing_bits & NVGPU_GPU_COMPBITS_CDEH)) { | 1540 | missing_cde_bits) { |
1351 | err = gk20a_buffer_convert_gpu_to_cde( | 1541 | err = gk20a_buffer_convert_gpu_to_cde( |
1352 | g, dmabuf, | 1542 | g, dmabuf, |
1353 | NVGPU_GPU_COMPBITS_CDEH, | 1543 | missing_cde_bits, |
1354 | offset, compbits_hoffset, | 1544 | offset, compbits_hoffset, |
1545 | compbits_voffset, | ||
1355 | width, height, block_height_log2, | 1546 | width, height, block_height_log2, |
1356 | submit_flags, fence, | 1547 | submit_flags, fence, |
1357 | &new_fence); | 1548 | state); |
1358 | if (err) | 1549 | if (err) |
1359 | goto out; | 1550 | goto out; |
1360 | |||
1361 | /* CDEH bits generated, update state & fence */ | ||
1362 | gk20a_fence_put(state->fence); | ||
1363 | state->fence = new_fence; | ||
1364 | state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEH; | ||
1365 | } | ||
1366 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && | ||
1367 | (missing_bits & NVGPU_GPU_COMPBITS_CDEV)) { | ||
1368 | err = gk20a_buffer_convert_gpu_to_cde( | ||
1369 | g, dmabuf, | ||
1370 | NVGPU_GPU_COMPBITS_CDEV, | ||
1371 | offset, compbits_voffset, | ||
1372 | width, height, block_height_log2, | ||
1373 | submit_flags, fence, | ||
1374 | &new_fence); | ||
1375 | if (err) | ||
1376 | goto out; | ||
1377 | |||
1378 | /* CDEH bits generated, update state & fence */ | ||
1379 | gk20a_fence_put(state->fence); | ||
1380 | state->fence = new_fence; | ||
1381 | state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEV; | ||
1382 | } | 1551 | } |
1383 | } | 1552 | } |
1384 | 1553 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h index 3347490c..b160162c 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h | |||
@@ -23,8 +23,9 @@ | |||
23 | 23 | ||
24 | #define MAX_CDE_BUFS 10 | 24 | #define MAX_CDE_BUFS 10 |
25 | #define MAX_CDE_PARAMS 64 | 25 | #define MAX_CDE_PARAMS 64 |
26 | #define MAX_CDE_USER_PARAMS 32 | 26 | #define MAX_CDE_USER_PARAMS 40 |
27 | #define MAX_CDE_OBJ_IDS 4 | 27 | #define MAX_CDE_OBJ_IDS 4 |
28 | #define MAX_CDE_ARRAY_ENTRIES 9 | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * The size of the context ring buffer that is dedicated for handling cde | 31 | * The size of the context ring buffer that is dedicated for handling cde |
@@ -162,6 +163,22 @@ struct gk20a_cde_cmd_elem { | |||
162 | }; | 163 | }; |
163 | 164 | ||
164 | /* | 165 | /* |
166 | * This element is used for storing a small array of data. | ||
167 | */ | ||
168 | |||
169 | enum { | ||
170 | ARRAY_PROGRAM_OFFSET = 0, | ||
171 | ARRAY_REGISTER_COUNT, | ||
172 | ARRAY_LAUNCH_COMMAND, | ||
173 | NUM_CDE_ARRAYS | ||
174 | }; | ||
175 | |||
176 | struct gk20a_cde_hdr_array { | ||
177 | u32 id; | ||
178 | u32 data[MAX_CDE_ARRAY_ENTRIES]; | ||
179 | }; | ||
180 | |||
181 | /* | ||
165 | * Following defines a single header element. Each element has a type and | 182 | * Following defines a single header element. Each element has a type and |
166 | * some of the data structures. | 183 | * some of the data structures. |
167 | */ | 184 | */ |
@@ -175,6 +192,7 @@ struct gk20a_cde_hdr_elem { | |||
175 | struct gk20a_cde_hdr_param param; | 192 | struct gk20a_cde_hdr_param param; |
176 | u32 required_class; | 193 | u32 required_class; |
177 | struct gk20a_cde_hdr_command command; | 194 | struct gk20a_cde_hdr_command command; |
195 | struct gk20a_cde_hdr_array array; | ||
178 | }; | 196 | }; |
179 | }; | 197 | }; |
180 | 198 | ||
@@ -183,7 +201,8 @@ enum { | |||
183 | TYPE_REPLACE, | 201 | TYPE_REPLACE, |
184 | TYPE_PARAM, | 202 | TYPE_PARAM, |
185 | TYPE_REQUIRED_CLASS, | 203 | TYPE_REQUIRED_CLASS, |
186 | TYPE_COMMAND | 204 | TYPE_COMMAND, |
205 | TYPE_ARRAY | ||
187 | }; | 206 | }; |
188 | 207 | ||
189 | struct gk20a_cde_mem_desc { | 208 | struct gk20a_cde_mem_desc { |
@@ -219,14 +238,12 @@ struct gk20a_cde_ctx { | |||
219 | /* storage for user space parameter values */ | 238 | /* storage for user space parameter values */ |
220 | u32 user_param_values[MAX_CDE_USER_PARAMS]; | 239 | u32 user_param_values[MAX_CDE_USER_PARAMS]; |
221 | 240 | ||
222 | u64 src_smmu_addr; | 241 | u32 surf_param_offset; |
223 | u32 src_param_offset; | 242 | u32 surf_param_lines; |
224 | u32 src_param_lines; | 243 | u64 surf_vaddr; |
225 | 244 | ||
226 | u64 src_vaddr; | 245 | u64 compbit_vaddr; |
227 | 246 | u64 compbit_size; | |
228 | u64 dest_vaddr; | ||
229 | u64 dest_size; | ||
230 | 247 | ||
231 | u32 obj_ids[MAX_CDE_OBJ_IDS]; | 248 | u32 obj_ids[MAX_CDE_OBJ_IDS]; |
232 | int num_obj_ids; | 249 | int num_obj_ids; |
@@ -259,6 +276,10 @@ struct gk20a_cde_app { | |||
259 | int ctx_usecount; | 276 | int ctx_usecount; |
260 | int ctx_count_top; | 277 | int ctx_count_top; |
261 | 278 | ||
279 | u32 firmware_version; | ||
280 | |||
281 | u32 arrays[NUM_CDE_ARRAYS][MAX_CDE_ARRAY_ENTRIES]; | ||
282 | |||
262 | u32 shader_parameter; | 283 | u32 shader_parameter; |
263 | }; | 284 | }; |
264 | 285 | ||
@@ -266,9 +287,9 @@ void gk20a_cde_destroy(struct gk20a *g); | |||
266 | void gk20a_cde_suspend(struct gk20a *g); | 287 | void gk20a_cde_suspend(struct gk20a *g); |
267 | int gk20a_init_cde_support(struct gk20a *g); | 288 | int gk20a_init_cde_support(struct gk20a *g); |
268 | int gk20a_cde_reload(struct gk20a *g); | 289 | int gk20a_cde_reload(struct gk20a *g); |
269 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, | 290 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *compbits_buf, |
270 | s32 dst_kind, u64 dst_word_offset, | 291 | s32 compbits_kind, u64 compbits_word_offset, |
271 | u32 dst_size, struct nvgpu_fence *fence, | 292 | u32 compbits_size, struct nvgpu_fence *fence, |
272 | u32 __flags, struct gk20a_cde_param *params, | 293 | u32 __flags, struct gk20a_cde_param *params, |
273 | int num_params, struct gk20a_fence **fence_out); | 294 | int num_params, struct gk20a_fence **fence_out); |
274 | void gk20a_cde_debugfs_init(struct platform_device *dev); | 295 | void gk20a_cde_debugfs_init(struct platform_device *dev); |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index a390e36b..08dd41c5 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -1546,7 +1546,7 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm, | |||
1546 | return vaddr; | 1546 | return vaddr; |
1547 | } | 1547 | } |
1548 | 1548 | ||
1549 | dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr) | 1549 | dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr) |
1550 | { | 1550 | { |
1551 | struct mapped_buffer_node *buffer; | 1551 | struct mapped_buffer_node *buffer; |
1552 | dma_addr_t addr = 0; | 1552 | dma_addr_t addr = 0; |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 3f7042ee..efed79f8 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -530,7 +530,7 @@ int gk20a_vm_map_buffer(struct gk20a_as_share *as_share, | |||
530 | int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset); | 530 | int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset); |
531 | void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf, | 531 | void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf, |
532 | struct gk20a_comptags *comptags); | 532 | struct gk20a_comptags *comptags); |
533 | dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr); | 533 | dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr); |
534 | 534 | ||
535 | int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); | 535 | int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); |
536 | 536 | ||