author     Jussi Rasanen <jrasanen@nvidia.com>     2014-10-03 05:44:05 -0400
committer  Dan Willemsen <dwillemsen@nvidia.com>   2015-03-18 15:12:06 -0400
commit     529962911c2e9b5c4e3a95b6c78dba8f15447a93 (patch)
tree       e439afdb7203e0810e543711b4333ede8f002b31 /drivers/gpu/nvgpu/gk20a
parent     6e22f39e8747a8ab9c720ef2e5236e5c94767f88 (diff)
gpu: nvgpu: cde: Combine H and V passes
When using CDE firmware v1, combine H and V swizzling passes into one
pushbuffer submission. This removes one GPU context switch, almost halving
the time taken for swizzling. Map only the compbit part of the destination
surface.

Bug 1546619

Change-Id: I95ed4e4c2eefd6d24a58854d31929cdb91ff556b
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/553234
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
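The "map only the compbit part" half of the change rests on simple alignment
arithmetic added to gk20a_cde_convert(): the mapped window is rounded down to a
big-page boundary, and the compbit start offset is then taken relative to that
window. A minimal sketch of that arithmetic, using plain integer types instead
of the driver's structures; the big-page size, buffer size, and offsets below
are made-up example values, not real driver state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t big_page_size        = 128 << 10;   /* e.g. 128 KiB big pages */
	uint64_t buf_size             = 8 << 20;     /* total dma-buf size */
	uint64_t compbits_byte_offset = 0x301000;    /* compbits inside the surface */
	uint64_t big_page_mask        = big_page_size - 1;

	/* the mapping must start on a big-page boundary */
	uint64_t map_offset      = compbits_byte_offset & ~big_page_mask;
	/* where the compbits begin inside the mapped window */
	uint64_t compbits_offset = compbits_byte_offset & big_page_mask;
	/* map from the aligned offset to the end of the buffer */
	uint64_t map_size        = buf_size - map_offset;

	printf("map_offset=0x%llx compbits_offset=0x%llx map_size=0x%llx\n",
	       (unsigned long long)map_offset,
	       (unsigned long long)compbits_offset,
	       (unsigned long long)map_size);
	return 0;
}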
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.c  | 541
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.h  |  45
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c   |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h   |   2
4 files changed, 390 insertions, 200 deletions
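The structural effect of the patch shows up in the new
gk20a_buffer_convert_gpu_to_cde() dispatcher in the diff below: with firmware
v0 the H and V swizzling passes remain two separate gk20a_cde_convert()
submissions, while with v1 both passes are patched into the same pushbuffer and
submitted once. A simplified, hypothetical outline of that control flow only;
submit_cde() stands in for gk20a_cde_convert(), and all parameter patching,
fence tracking, and error handling are omitted:

#include <stdio.h>

static void submit_cde(const char *passes)
{
	/* each call here corresponds to one pushbuffer submission,
	 * i.e. one GPU context switch */
	printf("submission: %s\n", passes);
}

static void convert_gpu_to_cde(unsigned int firmware_version)
{
	if (firmware_version == 0) {
		/* v0: H and V passes are separate submissions */
		submit_cde("H pass");
		submit_cde("V pass");
	} else {
		/* v1: both passes share one pushbuffer,
		 * saving a context switch */
		submit_cde("H pass + V pass");
	}
}

int main(void)
{
	convert_gpu_to_cde(0);
	convert_gpu_to_cde(1);
	return 0;
}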
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 472cc81c..8b2ed55e 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -337,8 +337,8 @@ static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target,
337} 337}
338 338
339static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, 339static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
340 const struct firmware *img, 340 const struct firmware *img,
341 struct gk20a_cde_hdr_replace *replace) 341 struct gk20a_cde_hdr_replace *replace)
342{ 342{
343 struct gk20a_cde_mem_desc *source_mem; 343 struct gk20a_cde_mem_desc *source_mem;
344 struct gk20a_cde_mem_desc *target_mem; 344 struct gk20a_cde_mem_desc *target_mem;
@@ -410,26 +410,26 @@ static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx)
410 g->gr.cacheline_size; 410 g->gr.cacheline_size;
411 break; 411 break;
412 case TYPE_PARAM_FIRSTPAGEOFFSET: 412 case TYPE_PARAM_FIRSTPAGEOFFSET:
413 new_data = cde_ctx->src_param_offset; 413 new_data = cde_ctx->surf_param_offset;
414 break; 414 break;
415 case TYPE_PARAM_NUMPAGES: 415 case TYPE_PARAM_NUMPAGES:
416 new_data = cde_ctx->src_param_lines; 416 new_data = cde_ctx->surf_param_lines;
417 break; 417 break;
418 case TYPE_PARAM_BACKINGSTORE: 418 case TYPE_PARAM_BACKINGSTORE:
419 new_data = cde_ctx->backing_store_vaddr; 419 new_data = cde_ctx->backing_store_vaddr;
420 break; 420 break;
421 case TYPE_PARAM_DESTINATION: 421 case TYPE_PARAM_DESTINATION:
422 new_data = cde_ctx->dest_vaddr; 422 new_data = cde_ctx->compbit_vaddr;
423 break; 423 break;
424 case TYPE_PARAM_DESTINATION_SIZE: 424 case TYPE_PARAM_DESTINATION_SIZE:
425 new_data = cde_ctx->dest_size; 425 new_data = cde_ctx->compbit_size;
426 break; 426 break;
427 case TYPE_PARAM_BACKINGSTORE_SIZE: 427 case TYPE_PARAM_BACKINGSTORE_SIZE:
428 new_data = g->gr.compbit_store.size; 428 new_data = g->gr.compbit_store.size;
429 break; 429 break;
430 case TYPE_PARAM_SOURCE_SMMU_ADDR: 430 case TYPE_PARAM_SOURCE_SMMU_ADDR:
431 new_data = gk20a_mm_gpuva_to_iova(cde_ctx->vm, 431 new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm,
432 cde_ctx->src_vaddr); 432 cde_ctx->surf_vaddr);
433 if (new_data == 0) 433 if (new_data == 0)
434 err = -EINVAL; 434 err = -EINVAL;
435 break; 435 break;
@@ -605,8 +605,9 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
605static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, 605static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
606 const struct firmware *img) 606 const struct firmware *img)
607{ 607{
608 struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
608 u32 *data = (u32 *)img->data; 609 u32 *data = (u32 *)img->data;
609 u32 version, num_of_elems; 610 u32 num_of_elems;
610 struct gk20a_cde_hdr_elem *elem; 611 struct gk20a_cde_hdr_elem *elem;
611 u32 min_size = 0; 612 u32 min_size = 0;
612 int err = 0; 613 int err = 0;
@@ -618,7 +619,7 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
618 return -EINVAL; 619 return -EINVAL;
619 } 620 }
620 621
621 version = data[0]; 622 cde_app->firmware_version = data[0];
622 num_of_elems = data[1]; 623 num_of_elems = data[1];
623 624
624 min_size += num_of_elems * sizeof(*elem); 625 min_size += num_of_elems * sizeof(*elem);
@@ -654,6 +655,11 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
654 elem->command.num_entries); 655 elem->command.num_entries);
655 break; 656 break;
656 } 657 }
658 case TYPE_ARRAY:
659 memcpy(&cde_app->arrays[elem->array.id][0],
660 elem->array.data,
661 MAX_CDE_ARRAY_ENTRIES*sizeof(u32));
662 break;
657 default: 663 default:
658 gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); 664 gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element");
659 err = -EINVAL; 665 err = -EINVAL;
@@ -853,27 +859,25 @@ static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
853} 859}
854 860
855int gk20a_cde_convert(struct gk20a *g, 861int gk20a_cde_convert(struct gk20a *g,
856 struct dma_buf *dst, 862 struct dma_buf *compbits_buf,
857 s32 dst_kind, u64 dst_byte_offset, 863 s32 compbits_kind, u64 compbits_byte_offset,
858 u32 dst_size, struct nvgpu_fence *fence, 864 u32 compbits_size, struct nvgpu_fence *fence,
859 u32 __flags, struct gk20a_cde_param *params, 865 u32 __flags, struct gk20a_cde_param *params,
860 int num_params, struct gk20a_fence **fence_out) 866 int num_params, struct gk20a_fence **fence_out)
861__acquires(&cde_app->mutex) 867__acquires(&cde_app->mutex)
862__releases(&cde_app->mutex) 868__releases(&cde_app->mutex)
863{ 869{
864 struct gk20a_cde_app *cde_app = &g->cde_app; 870 struct gk20a_cde_ctx *cde_ctx = NULL;
865 struct gk20a_comptags comptags; 871 struct gk20a_comptags comptags;
866 struct gk20a_cde_ctx *cde_ctx; 872 u64 compbits_offset = 0;
867 u64 dst_vaddr = 0; 873 u64 map_vaddr = 0;
874 u64 map_offset = 0;
875 u32 map_size = 0;
876 u64 big_page_mask = 0;
868 u32 flags; 877 u32 flags;
869 int err, i; 878 int err, i;
870 879
871 if (!cde_app->initialised) { 880 mutex_lock(&g->cde_app.mutex);
872 gk20a_warn(&g->dev->dev, "cde: conversion requrest but no image has been provided");
873 return -ENOSYS;
874 }
875
876 mutex_lock(&cde_app->mutex);
877 881
878 cde_ctx = gk20a_cde_get_context(g); 882 cde_ctx = gk20a_cde_get_context(g);
879 if (IS_ERR(cde_ctx)) { 883 if (IS_ERR(cde_ctx)) {
@@ -881,38 +885,53 @@ __releases(&cde_app->mutex)
881 goto exit_unlock; 885 goto exit_unlock;
882 } 886 }
883 887
884 /* First, map the buffers to local va */ 888 /* First, map the buffer to local va */
885 889
886 /* ensure that the dst buffer has drvdata */ 890 /* ensure that the compbits buffer has drvdata */
887 err = gk20a_dmabuf_alloc_drvdata(dst, &g->dev->dev); 891 err = gk20a_dmabuf_alloc_drvdata(compbits_buf, &g->dev->dev);
888 if (err) 892 if (err)
889 goto exit_unlock; 893 goto exit_unlock;
890 894
895 /* compbits don't start at page aligned offset, so we need to align
896 the region to be mapped */
897 big_page_mask = cde_ctx->vm->big_page_size - 1;
898 map_offset = compbits_byte_offset & ~big_page_mask;
899
900 /* compute compbit start offset from the beginning of the mapped
901 area */
902 compbits_offset = compbits_byte_offset & big_page_mask;
903
904 if (!compbits_size) {
905 compbits_size = compbits_buf->size - compbits_byte_offset;
906 map_size = compbits_buf->size - map_offset;
907 }
908
891 /* map the destination buffer */ 909 /* map the destination buffer */
892 get_dma_buf(dst); /* a ref for gk20a_vm_map */ 910 get_dma_buf(compbits_buf); /* a ref for gk20a_vm_map */
893 dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0, 911 map_vaddr = gk20a_vm_map(cde_ctx->vm, compbits_buf, 0,
894 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 912 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
895 dst_kind, NULL, true, 913 compbits_kind, NULL, true,
896 gk20a_mem_flag_none, 914 gk20a_mem_flag_none,
897 0, 0); 915 map_offset, map_size);
898 if (!dst_vaddr) { 916 if (!map_vaddr) {
899 dma_buf_put(dst); 917 dma_buf_put(compbits_buf);
900 err = -EINVAL; 918 err = -EINVAL;
901 goto exit_unlock; 919 goto exit_unlock;
902 } 920 }
903 921
904 if (!dst_size)
905 dst_size = dst->size - dst_byte_offset;
906
907 /* store source buffer compression tags */ 922 /* store source buffer compression tags */
908 gk20a_get_comptags(&g->dev->dev, dst, &comptags); 923 gk20a_get_comptags(&g->dev->dev, compbits_buf, &comptags);
909 cde_ctx->src_vaddr = dst_vaddr; 924 cde_ctx->surf_param_offset = comptags.offset;
910 cde_ctx->src_param_offset = comptags.offset; 925 cde_ctx->surf_param_lines = comptags.lines;
911 cde_ctx->src_param_lines = comptags.lines; 926
927 /* store surface vaddr. This is actually compbit vaddr, but since
928 compbits live in the same surface, and we can get the alloc base
929 address by using gk20a_mm_gpuva_to_iova_base, this will do */
930 cde_ctx->surf_vaddr = map_vaddr;
912 931
913 /* store information about destination */ 932 /* store information about destination */
914 cde_ctx->dest_vaddr = dst_vaddr + dst_byte_offset; 933 cde_ctx->compbit_vaddr = map_vaddr + compbits_offset;
915 cde_ctx->dest_size = dst_size; 934 cde_ctx->compbit_size = compbits_size;
916 935
917 /* remove existing argument data */ 936 /* remove existing argument data */
918 memset(cde_ctx->user_param_values, 0, 937 memset(cde_ctx->user_param_values, 0,
@@ -940,8 +959,8 @@ __releases(&cde_app->mutex)
940 959
941 gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", 960 gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n",
942 g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); 961 g->gr.compbit_store.size, cde_ctx->backing_store_vaddr);
943 gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", 962 gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n",
944 cde_ctx->dest_size, cde_ctx->dest_vaddr); 963 cde_ctx->compbit_size, cde_ctx->compbit_vaddr);
945 964
946 /* execute the init push buffer */ 965 /* execute the init push buffer */
947 if (!cde_ctx->init_cmd_executed) { 966 if (!cde_ctx->init_cmd_executed) {
@@ -964,11 +983,10 @@ __releases(&cde_app->mutex)
964exit_unlock: 983exit_unlock:
965 984
966 /* unmap the buffers - channel holds references to them now */ 985 /* unmap the buffers - channel holds references to them now */
967 if (dst_vaddr) 986 if (map_vaddr)
968 gk20a_vm_unmap(cde_ctx->vm, dst_vaddr); 987 gk20a_vm_unmap(cde_ctx->vm, map_vaddr);
969
970 mutex_unlock(&cde_app->mutex);
971 988
989 mutex_unlock(&g->cde_app.mutex);
972 return err; 990 return err;
973} 991}
974 992
@@ -1159,152 +1177,322 @@ __releases(&cde_app->mutex)
1159 return err; 1177 return err;
1160} 1178}
1161 1179
1162enum cde_launch_patch_offset {
1163 /* dst buffer width in roptiles */
1164 PATCH_USER_CONST_XTILES,
1165 /* dst buffer height in roptiles */
1166 PATCH_USER_CONST_YTILES,
1167 /* dst buffer log2(block height) */
1168 PATCH_USER_CONST_BLOCKHEIGHTLOG2,
1169 /* dst buffer pitch in bytes */
1170 PATCH_USER_CONST_DSTPITCH,
1171 /* dst buffer write offset */
1172 PATCH_USER_CONST_DSTOFFSET,
1173 /* comp cache index of the first page of the surface,
1174 * kernel looks it up from PTE */
1175 PATCH_USER_CONST_FIRSTPAGEOFFSET,
1176 /* gmmu translated surface address, kernel fills */
1177 PATCH_USER_CONST_SURFADDR,
1178 /* dst buffer address >> 8, kernel fills */
1179 PATCH_VPC_DSTIMAGE_ADDR,
1180 /* dst buffer address >> 8, kernel fills */
1181 PATCH_VPC_DSTIMAGE_ADDR2,
1182 /* dst buffer size - 1, kernel fills */
1183 PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE,
1184 /* dst buffer size - 1, kernel fills */
1185 PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE2,
1186 /* dst buffer size, kernel fills */
1187 PATCH_VPC_DSTIMAGE_SIZE,
1188 /* dst buffer width in roptiles / work group width */
1189 PATCH_VPC_CURRENT_GRID_SIZE_X,
1190 /* dst buffer height in roptiles / work group height */
1191 PATCH_VPC_CURRENT_GRID_SIZE_Y,
1192 /* 1 */
1193 PATCH_VPC_CURRENT_GRID_SIZE_Z,
1194 /* work group width, 16 seems to be quite optimal */
1195 PATCH_VPC_CURRENT_GROUP_SIZE_X,
1196 /* work group height, 8 seems to be quite optimal */
1197 PATCH_VPC_CURRENT_GROUP_SIZE_Y,
1198 /* 1 */
1199 PATCH_VPC_CURRENT_GROUP_SIZE_Z,
1200 /* same as PATCH_VPC_CURRENT_GRID_SIZE_X */
1201 PATCH_QMD_CTA_RASTER_WIDTH,
1202 /* same as PATCH_VPC_CURRENT_GRID_SIZE_Y */
1203 PATCH_QMD_CTA_RASTER_HEIGHT,
1204 /* same as PATCH_VPC_CURRENT_GRID_SIZE_Z */
1205 PATCH_QMD_CTA_RASTER_DEPTH,
1206 /* same as PATCH_VPC_CURRENT_GROUP_SIZE_X */
1207 PATCH_QMD_CTA_THREAD_DIMENSION0,
1208 /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Y */
1209 PATCH_QMD_CTA_THREAD_DIMENSION1,
1210 /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Z */
1211 PATCH_QMD_CTA_THREAD_DIMENSION2,
1212
1213 NUM_CDE_LAUNCH_PATCHES
1214};
1215
1216enum cde_launch_patch_id { 1180enum cde_launch_patch_id {
1217 PATCH_QMD_CTA_RASTER_WIDTH_ID = 1024, 1181 PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024,
1218 PATCH_QMD_CTA_RASTER_HEIGHT_ID = 1025, 1182 PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025,
1219 PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, 1183 PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, /* for firmware v0 only */
1220 PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, 1184 PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027,
1221 PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, 1185 PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028,
1222 PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, 1186 PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, /* for firmware v0 only */
1223 PATCH_USER_CONST_XTILES_ID = 1030, 1187 PATCH_USER_CONST_XTILES_ID = 1030, /* for firmware v0 only */
1224 PATCH_USER_CONST_YTILES_ID = 1031, 1188 PATCH_USER_CONST_YTILES_ID = 1031, /* for firmware v0 only */
1225 PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, 1189 PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032,
1226 PATCH_USER_CONST_DSTPITCH_ID = 1033, 1190 PATCH_USER_CONST_DSTPITCH_ID = 1033, /* for firmware v0 only */
1227 PATCH_USER_CONST_DSTOFFSET_ID = 1034, 1191 PATCH_H_USER_CONST_FLAGS_ID = 1034, /* for firmware v0 only */
1228 PATCH_VPC_CURRENT_GRID_SIZE_X_ID = 1035, 1192 PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035,
1229 PATCH_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, 1193 PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036,
1230 PATCH_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, 1194 PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037,
1231 PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, 1195 PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038,
1232 PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, 1196 PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039,
1233 PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, 1197 PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040,
1198 PATCH_USER_CONST_XBLOCKS_ID = 1041,
1199 PATCH_H_USER_CONST_DSTOFFSET_ID = 1042,
1200 PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043,
1201 PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044,
1202 PATCH_V_USER_CONST_DSTOFFSET_ID = 1045,
1203 PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046,
1204 PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047,
1205 PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048,
1206 PATCH_H_LAUNCH_WORD1_ID = 1049,
1207 PATCH_H_LAUNCH_WORD2_ID = 1050,
1208 PATCH_V_LAUNCH_WORD1_ID = 1051,
1209 PATCH_V_LAUNCH_WORD2_ID = 1052,
1210 PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053,
1211 PATCH_H_QMD_REGISTER_COUNT_ID = 1054,
1212 PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055,
1213 PATCH_V_QMD_REGISTER_COUNT_ID = 1056,
1234}; 1214};
1235 1215
1236static int gk20a_buffer_convert_gpu_to_cde( 1216enum programs {
1237 struct gk20a *g, struct dma_buf *dmabuf, u32 consumer, 1217 PROG_HPASS = 0,
1238 u64 offset, u64 compbits_offset, 1218 PROG_VPASS_LARGE = 1,
1219 PROG_VPASS_SMALL = 2,
1220 PROG_HPASS_DEBUG = 3,
1221 PROG_VPASS_LARGE_DEBUG = 4,
1222 PROG_VPASS_SMALL_DEBUG = 5,
1223 PROG_PASSTHROUGH = 6,
1224 NUM_PROGRAMS = 7
1225};
1226
1227/* maximum number of WRITE_PATCHes in the below function */
1228#define MAX_CDE_LAUNCH_PATCHES 32
1229
1230static int gk20a_buffer_convert_gpu_to_cde_v0(
1231 struct gk20a *g,
1232 struct dma_buf *dmabuf, u32 consumer,
1233 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1239 u32 width, u32 height, u32 block_height_log2, 1234 u32 width, u32 height, u32 block_height_log2,
1240 u32 submit_flags, struct nvgpu_fence *fence_in, 1235 u32 submit_flags, struct nvgpu_fence *fence_in,
1241 struct gk20a_fence **fence_out) 1236 struct gk20a_buffer_state *state)
1242{ 1237{
1243 struct gk20a_cde_param params[NUM_CDE_LAUNCH_PATCHES]; 1238 struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
1244 int param = 0; 1239 int param = 0;
1245 int err = 0; 1240 int err = 0;
1241 struct gk20a_fence *new_fence = NULL;
1242 const int wgx = 8;
1243 const int wgy = 8;
1244 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
1245 const int xalign = compbits_per_byte * wgx;
1246 const int yalign = wgy;
1246 1247
1247 /* Compute per launch parameters */ 1248 /* firmware v0 needs to call swizzling twice */
1248 const bool transpose = (consumer == NVGPU_GPU_COMPBITS_CDEV); 1249 int i;
1249 const int transposed_width = transpose ? height : width; 1250 for (i = 0; i < 2; i++) {
1250 const int transposed_height = transpose ? width : height; 1251 /* Compute per launch parameters */
1251 const int xtiles = (transposed_width + 7) >> 3; 1252 const bool vpass = (i == 1);
1252 const int ytiles = (transposed_height + 7) >> 3; 1253 const int transposed_width = vpass ? height : width;
1254 const int transposed_height = vpass ? width : height;
1255 const int xtiles = (transposed_width + 7) >> 3;
1256 const int ytiles = (transposed_height + 7) >> 3;
1257 const int gridw = roundup(xtiles, xalign) / xalign;
1258 const int gridh = roundup(ytiles, yalign) / yalign;
1259 const int flags = (vpass ? 4 : 0) |
1260 g->cde_app.shader_parameter;
1261 const int dst_stride = 128; /* chip constant */
1262
1263 if ((vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEV)) ||
1264 (!vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEH)))
1265 continue;
1266
1267 if (xtiles > 4096 / 8 || ytiles > 4096 / 8)
1268 gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
1269 xtiles, ytiles);
1270
1271 gk20a_dbg(gpu_dbg_cde, "pass=%c", vpass ? 'V' : 'H');
1272 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx",
1273 width, height, block_height_log2,
1274 compbits_hoffset, compbits_voffset);
1275 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
1276 width, height, xtiles, ytiles);
1277 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
1278 wgx, wgy, gridw, gridh);
1279
1280 /* Write parameters */
1281#define WRITE_PATCH(NAME, VALUE) \
1282 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
1283 param = 0;
1284 WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles);
1285 WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles);
1286 WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
1287 block_height_log2);
1288 WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride);
1289 WRITE_PATCH(PATCH_H_USER_CONST_FLAGS, flags);
1290 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw);
1291 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh);
1292 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);
1293 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
1294 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
1295 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);
1296 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw);
1297 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh);
1298 WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1);
1299 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
1300 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
1301 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1);
1302#undef WRITE_PATCH
1303
1304 err = gk20a_cde_convert(g, dmabuf,
1305 0, /* dst kind */
1306 vpass ?
1307 compbits_voffset :
1308 compbits_hoffset,
1309 0, /* dst_size, 0 = auto */
1310 fence_in, submit_flags,
1311 params, param,
1312 &new_fence);
1313 if (err)
1314 goto out;
1315
1316 /* compbits generated, update state & fence */
1317 gk20a_fence_put(state->fence);
1318 state->fence = new_fence;
1319 state->valid_compbits |= vpass ?
1320 NVGPU_GPU_COMPBITS_CDEV :
1321 NVGPU_GPU_COMPBITS_CDEH;
1322 }
1323out:
1324 return err;
1325}
1326
1327static int gk20a_buffer_convert_gpu_to_cde_v1(
1328 struct gk20a *g,
1329 struct dma_buf *dmabuf, u32 consumer,
1330 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1331 u32 width, u32 height, u32 block_height_log2,
1332 u32 submit_flags, struct nvgpu_fence *fence_in,
1333 struct gk20a_buffer_state *state)
1334{
1335 struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
1336 int param = 0;
1337 int err = 0;
1338 struct gk20a_fence *new_fence = NULL;
1253 const int wgx = 8; 1339 const int wgx = 8;
1254 const int wgy = 8; 1340 const int wgy = 8;
1255 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ 1341 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
1256 const int dst_stride = 128; /* TODO chip constant */
1257 const int xalign = compbits_per_byte * wgx; 1342 const int xalign = compbits_per_byte * wgx;
1258 const int yalign = wgy; 1343 const int yalign = wgy;
1259 const int gridw = roundup(xtiles, xalign) / xalign;
1260 const int gridh = roundup(ytiles, yalign) / yalign;
1261 1344
1262 if (!g->cde_app.initialised) 1345 /* Compute per launch parameters */
1263 return -ENOSYS; 1346 const int xtiles = (width + 7) >> 3;
1347 const int ytiles = (height + 7) >> 3;
1348 const int gridw_h = roundup(xtiles, xalign) / xalign;
1349 const int gridh_h = roundup(ytiles, yalign) / yalign;
1350 const int gridw_v = roundup(ytiles, xalign) / xalign;
1351 const int gridh_v = roundup(xtiles, yalign) / yalign;
1352 const int xblocks = (xtiles + 1) >> 1;
1353 const int voffset = compbits_voffset - compbits_hoffset;
1354
1355 int hprog = PROG_HPASS;
1356 int vprog = (block_height_log2 >= 2) ?
1357 PROG_VPASS_LARGE : PROG_VPASS_SMALL;
1358 if (g->cde_app.shader_parameter == 1) {
1359 hprog = PROG_PASSTHROUGH;
1360 vprog = PROG_PASSTHROUGH;
1361 } else if (g->cde_app.shader_parameter == 2) {
1362 hprog = PROG_HPASS_DEBUG;
1363 vprog = (block_height_log2 >= 2) ?
1364 PROG_VPASS_LARGE_DEBUG :
1365 PROG_VPASS_SMALL_DEBUG;
1366 }
1264 1367
1265 if (xtiles > 4096 / 8 || ytiles > 4096 / 8) 1368 if (xtiles > 4096 / 8 || ytiles > 4096 / 8)
1266 gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", 1369 gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
1267 xtiles, ytiles); 1370 xtiles, ytiles);
1268 1371
1269 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", 1372 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx",
1270 width, height, block_height_log2, compbits_offset); 1373 width, height, block_height_log2,
1271 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", 1374 compbits_hoffset, compbits_voffset);
1272 width, height, xtiles, ytiles, gridw*wgx, gridh*wgy); 1375 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
1273 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", 1376 width, height, xtiles, ytiles);
1274 wgx, wgy, gridw, gridh); 1377 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)",
1378 wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
1379 gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
1380 hprog,
1381 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
1382 g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
1383 vprog,
1384 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
1385 g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1275 1386
1276 /* Write parameters */ 1387 /* Write parameters */
1277#define WRITE_PATCH(NAME, VALUE) \ 1388#define WRITE_PATCH(NAME, VALUE) \
1278 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} 1389 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
1279 WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles); 1390 WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks);
1280 WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles); 1391 WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
1281 WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, block_height_log2); 1392 block_height_log2);
1282 WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride); 1393 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
1283 WRITE_PATCH(PATCH_USER_CONST_DSTOFFSET, 1394 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
1284 (transpose ? 4 : 0) | g->cde_app.shader_parameter);
1285 WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_X, gridw);
1286 WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Y, gridh);
1287 WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Z, 1);
1288 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); 1395 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
1289 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); 1396 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
1290 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); 1397 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);
1291 WRITE_PATCH(PATCH_QMD_CTA_RASTER_WIDTH, gridw); 1398
1292 WRITE_PATCH(PATCH_QMD_CTA_RASTER_HEIGHT, gridh); 1399 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h);
1293 WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1); 1400 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h);
1294 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); 1401 WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0);
1295 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); 1402 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h);
1296 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1); 1403 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h);
1404 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);
1405
1406 WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v);
1407 WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v);
1408 WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset);
1409 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v);
1410 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v);
1411 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);
1412
1413 WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
1414 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
1415 WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
1416 g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
1417 WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
1418 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
1419 WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
1420 g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1421
1422 if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
1423 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1424 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1425 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1426 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1427 } else {
1428 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1429 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1430 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1431 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1432 }
1433
1434 if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
1435 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1436 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1437 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1438 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1439 } else {
1440 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1441 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1442 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1443 g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1444 }
1297#undef WRITE_PATCH 1445#undef WRITE_PATCH
1298 1446
1299 err = gk20a_busy(g->dev);
1300 if (err)
1301 return err;
1302 err = gk20a_cde_convert(g, dmabuf, 1447 err = gk20a_cde_convert(g, dmabuf,
1303 0, /* dst kind */ 1448 0, /* dst kind */
1304 compbits_offset, 1449 compbits_hoffset,
1305 0, /* dst_size, 0 = auto */ 1450 0, /* dst_size, 0 = auto */
1306 fence_in, submit_flags, 1451 fence_in, submit_flags,
1307 params, param, fence_out); 1452 params, param, &new_fence);
1453 if (err)
1454 goto out;
1455
1456 /* compbits generated, update state & fence */
1457 gk20a_fence_put(state->fence);
1458 state->fence = new_fence;
1459 state->valid_compbits |= consumer &
1460 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1461out:
1462 return err;
1463}
1464
1465static int gk20a_buffer_convert_gpu_to_cde(
1466 struct gk20a *g, struct dma_buf *dmabuf, u32 consumer,
1467 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1468 u32 width, u32 height, u32 block_height_log2,
1469 u32 submit_flags, struct nvgpu_fence *fence_in,
1470 struct gk20a_buffer_state *state)
1471{
1472 int err = 0;
1473
1474 if (!g->cde_app.initialised)
1475 return -ENOSYS;
1476
1477 err = gk20a_busy(g->dev);
1478 if (err)
1479 return err;
1480
1481 gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n",
1482 g->cde_app.firmware_version);
1483
1484 if (g->cde_app.firmware_version == 0) {
1485 err = gk20a_buffer_convert_gpu_to_cde_v0(
1486 g, dmabuf, consumer, offset, compbits_hoffset,
1487 compbits_voffset, width, height, block_height_log2,
1488 submit_flags, fence_in, state);
1489 } else {
1490 err = gk20a_buffer_convert_gpu_to_cde_v1(
1491 g, dmabuf, consumer, offset, compbits_hoffset,
1492 compbits_voffset, width, height, block_height_log2,
1493 submit_flags, fence_in, state);
1494 }
1495
1308 gk20a_idle(g->dev); 1496 gk20a_idle(g->dev);
1309 return err; 1497 return err;
1310} 1498}
@@ -1326,7 +1514,8 @@ int gk20a_prepare_compressible_read(
1326 if (IS_ERR(dmabuf)) 1514 if (IS_ERR(dmabuf))
1327 return -EINVAL; 1515 return -EINVAL;
1328 1516
1329 err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state); 1517 err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g),
1518 offset, &state);
1330 if (err) { 1519 if (err) {
1331 dma_buf_put(dmabuf); 1520 dma_buf_put(dmabuf);
1332 return err; 1521 return err;
@@ -1345,40 +1534,20 @@ int gk20a_prepare_compressible_read(
1345 err = -EINVAL; 1534 err = -EINVAL;
1346 goto out; 1535 goto out;
1347 } else if (missing_bits) { 1536 } else if (missing_bits) {
1348 struct gk20a_fence *new_fence = NULL; 1537 u32 missing_cde_bits = missing_bits &
1538 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1349 if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && 1539 if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
1350 (missing_bits & NVGPU_GPU_COMPBITS_CDEH)) { 1540 missing_cde_bits) {
1351 err = gk20a_buffer_convert_gpu_to_cde( 1541 err = gk20a_buffer_convert_gpu_to_cde(
1352 g, dmabuf, 1542 g, dmabuf,
1353 NVGPU_GPU_COMPBITS_CDEH, 1543 missing_cde_bits,
1354 offset, compbits_hoffset, 1544 offset, compbits_hoffset,
1545 compbits_voffset,
1355 width, height, block_height_log2, 1546 width, height, block_height_log2,
1356 submit_flags, fence, 1547 submit_flags, fence,
1357 &new_fence); 1548 state);
1358 if (err) 1549 if (err)
1359 goto out; 1550 goto out;
1360
1361 /* CDEH bits generated, update state & fence */
1362 gk20a_fence_put(state->fence);
1363 state->fence = new_fence;
1364 state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEH;
1365 }
1366 if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
1367 (missing_bits & NVGPU_GPU_COMPBITS_CDEV)) {
1368 err = gk20a_buffer_convert_gpu_to_cde(
1369 g, dmabuf,
1370 NVGPU_GPU_COMPBITS_CDEV,
1371 offset, compbits_voffset,
1372 width, height, block_height_log2,
1373 submit_flags, fence,
1374 &new_fence);
1375 if (err)
1376 goto out;
1377
1378 /* CDEH bits generated, update state & fence */
1379 gk20a_fence_put(state->fence);
1380 state->fence = new_fence;
1381 state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEV;
1382 } 1551 }
1383 } 1552 }
1384 1553
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index 3347490c..b160162c 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -23,8 +23,9 @@
23 23
24#define MAX_CDE_BUFS 10 24#define MAX_CDE_BUFS 10
25#define MAX_CDE_PARAMS 64 25#define MAX_CDE_PARAMS 64
26#define MAX_CDE_USER_PARAMS 32 26#define MAX_CDE_USER_PARAMS 40
27#define MAX_CDE_OBJ_IDS 4 27#define MAX_CDE_OBJ_IDS 4
28#define MAX_CDE_ARRAY_ENTRIES 9
28 29
29/* 30/*
30 * The size of the context ring buffer that is dedicated for handling cde 31 * The size of the context ring buffer that is dedicated for handling cde
@@ -162,6 +163,22 @@ struct gk20a_cde_cmd_elem {
162}; 163};
163 164
164/* 165/*
166 * This element is used for storing a small array of data.
167 */
168
169enum {
170 ARRAY_PROGRAM_OFFSET = 0,
171 ARRAY_REGISTER_COUNT,
172 ARRAY_LAUNCH_COMMAND,
173 NUM_CDE_ARRAYS
174};
175
176struct gk20a_cde_hdr_array {
177 u32 id;
178 u32 data[MAX_CDE_ARRAY_ENTRIES];
179};
180
181/*
165 * Following defines a single header element. Each element has a type and 182 * Following defines a single header element. Each element has a type and
166 * some of the data structures. 183 * some of the data structures.
167 */ 184 */
@@ -175,6 +192,7 @@ struct gk20a_cde_hdr_elem {
175 struct gk20a_cde_hdr_param param; 192 struct gk20a_cde_hdr_param param;
176 u32 required_class; 193 u32 required_class;
177 struct gk20a_cde_hdr_command command; 194 struct gk20a_cde_hdr_command command;
195 struct gk20a_cde_hdr_array array;
178 }; 196 };
179}; 197};
180 198
@@ -183,7 +201,8 @@ enum {
183 TYPE_REPLACE, 201 TYPE_REPLACE,
184 TYPE_PARAM, 202 TYPE_PARAM,
185 TYPE_REQUIRED_CLASS, 203 TYPE_REQUIRED_CLASS,
186 TYPE_COMMAND 204 TYPE_COMMAND,
205 TYPE_ARRAY
187}; 206};
188 207
189struct gk20a_cde_mem_desc { 208struct gk20a_cde_mem_desc {
@@ -219,14 +238,12 @@ struct gk20a_cde_ctx {
219 /* storage for user space parameter values */ 238 /* storage for user space parameter values */
220 u32 user_param_values[MAX_CDE_USER_PARAMS]; 239 u32 user_param_values[MAX_CDE_USER_PARAMS];
221 240
222 u64 src_smmu_addr; 241 u32 surf_param_offset;
223 u32 src_param_offset; 242 u32 surf_param_lines;
224 u32 src_param_lines; 243 u64 surf_vaddr;
225 244
226 u64 src_vaddr; 245 u64 compbit_vaddr;
227 246 u64 compbit_size;
228 u64 dest_vaddr;
229 u64 dest_size;
230 247
231 u32 obj_ids[MAX_CDE_OBJ_IDS]; 248 u32 obj_ids[MAX_CDE_OBJ_IDS];
232 int num_obj_ids; 249 int num_obj_ids;
@@ -259,6 +276,10 @@ struct gk20a_cde_app {
259 int ctx_usecount; 276 int ctx_usecount;
260 int ctx_count_top; 277 int ctx_count_top;
261 278
279 u32 firmware_version;
280
281 u32 arrays[NUM_CDE_ARRAYS][MAX_CDE_ARRAY_ENTRIES];
282
262 u32 shader_parameter; 283 u32 shader_parameter;
263}; 284};
264 285
@@ -266,9 +287,9 @@ void gk20a_cde_destroy(struct gk20a *g);
266void gk20a_cde_suspend(struct gk20a *g); 287void gk20a_cde_suspend(struct gk20a *g);
267int gk20a_init_cde_support(struct gk20a *g); 288int gk20a_init_cde_support(struct gk20a *g);
268int gk20a_cde_reload(struct gk20a *g); 289int gk20a_cde_reload(struct gk20a *g);
269int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, 290int gk20a_cde_convert(struct gk20a *g, struct dma_buf *compbits_buf,
270 s32 dst_kind, u64 dst_word_offset, 291 s32 compbits_kind, u64 compbits_word_offset,
271 u32 dst_size, struct nvgpu_fence *fence, 292 u32 compbits_size, struct nvgpu_fence *fence,
272 u32 __flags, struct gk20a_cde_param *params, 293 u32 __flags, struct gk20a_cde_param *params,
273 int num_params, struct gk20a_fence **fence_out); 294 int num_params, struct gk20a_fence **fence_out);
274void gk20a_cde_debugfs_init(struct platform_device *dev); 295void gk20a_cde_debugfs_init(struct platform_device *dev);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index a390e36b..08dd41c5 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1546,7 +1546,7 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1546 return vaddr; 1546 return vaddr;
1547} 1547}
1548 1548
1549dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr) 1549dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr)
1550{ 1550{
1551 struct mapped_buffer_node *buffer; 1551 struct mapped_buffer_node *buffer;
1552 dma_addr_t addr = 0; 1552 dma_addr_t addr = 0;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 3f7042ee..efed79f8 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -530,7 +530,7 @@ int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
530int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset); 530int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset);
531void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf, 531void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
532 struct gk20a_comptags *comptags); 532 struct gk20a_comptags *comptags);
533dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr); 533dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr);
534 534
535int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); 535int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
536 536