diff options
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 541 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 45 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 2 |
4 files changed, 390 insertions, 200 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index 472cc81c..8b2ed55e 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c | |||
@@ -337,8 +337,8 @@ static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, | |||
337 | } | 337 | } |
338 | 338 | ||
339 | static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, | 339 | static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, |
340 | const struct firmware *img, | 340 | const struct firmware *img, |
341 | struct gk20a_cde_hdr_replace *replace) | 341 | struct gk20a_cde_hdr_replace *replace) |
342 | { | 342 | { |
343 | struct gk20a_cde_mem_desc *source_mem; | 343 | struct gk20a_cde_mem_desc *source_mem; |
344 | struct gk20a_cde_mem_desc *target_mem; | 344 | struct gk20a_cde_mem_desc *target_mem; |
@@ -410,26 +410,26 @@ static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx) | |||
410 | g->gr.cacheline_size; | 410 | g->gr.cacheline_size; |
411 | break; | 411 | break; |
412 | case TYPE_PARAM_FIRSTPAGEOFFSET: | 412 | case TYPE_PARAM_FIRSTPAGEOFFSET: |
413 | new_data = cde_ctx->src_param_offset; | 413 | new_data = cde_ctx->surf_param_offset; |
414 | break; | 414 | break; |
415 | case TYPE_PARAM_NUMPAGES: | 415 | case TYPE_PARAM_NUMPAGES: |
416 | new_data = cde_ctx->src_param_lines; | 416 | new_data = cde_ctx->surf_param_lines; |
417 | break; | 417 | break; |
418 | case TYPE_PARAM_BACKINGSTORE: | 418 | case TYPE_PARAM_BACKINGSTORE: |
419 | new_data = cde_ctx->backing_store_vaddr; | 419 | new_data = cde_ctx->backing_store_vaddr; |
420 | break; | 420 | break; |
421 | case TYPE_PARAM_DESTINATION: | 421 | case TYPE_PARAM_DESTINATION: |
422 | new_data = cde_ctx->dest_vaddr; | 422 | new_data = cde_ctx->compbit_vaddr; |
423 | break; | 423 | break; |
424 | case TYPE_PARAM_DESTINATION_SIZE: | 424 | case TYPE_PARAM_DESTINATION_SIZE: |
425 | new_data = cde_ctx->dest_size; | 425 | new_data = cde_ctx->compbit_size; |
426 | break; | 426 | break; |
427 | case TYPE_PARAM_BACKINGSTORE_SIZE: | 427 | case TYPE_PARAM_BACKINGSTORE_SIZE: |
428 | new_data = g->gr.compbit_store.size; | 428 | new_data = g->gr.compbit_store.size; |
429 | break; | 429 | break; |
430 | case TYPE_PARAM_SOURCE_SMMU_ADDR: | 430 | case TYPE_PARAM_SOURCE_SMMU_ADDR: |
431 | new_data = gk20a_mm_gpuva_to_iova(cde_ctx->vm, | 431 | new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm, |
432 | cde_ctx->src_vaddr); | 432 | cde_ctx->surf_vaddr); |
433 | if (new_data == 0) | 433 | if (new_data == 0) |
434 | err = -EINVAL; | 434 | err = -EINVAL; |
435 | break; | 435 | break; |
@@ -605,8 +605,9 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx, | |||
605 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | 605 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, |
606 | const struct firmware *img) | 606 | const struct firmware *img) |
607 | { | 607 | { |
608 | struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app; | ||
608 | u32 *data = (u32 *)img->data; | 609 | u32 *data = (u32 *)img->data; |
609 | u32 version, num_of_elems; | 610 | u32 num_of_elems; |
610 | struct gk20a_cde_hdr_elem *elem; | 611 | struct gk20a_cde_hdr_elem *elem; |
611 | u32 min_size = 0; | 612 | u32 min_size = 0; |
612 | int err = 0; | 613 | int err = 0; |
@@ -618,7 +619,7 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | |||
618 | return -EINVAL; | 619 | return -EINVAL; |
619 | } | 620 | } |
620 | 621 | ||
621 | version = data[0]; | 622 | cde_app->firmware_version = data[0]; |
622 | num_of_elems = data[1]; | 623 | num_of_elems = data[1]; |
623 | 624 | ||
624 | min_size += num_of_elems * sizeof(*elem); | 625 | min_size += num_of_elems * sizeof(*elem); |
@@ -654,6 +655,11 @@ static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | |||
654 | elem->command.num_entries); | 655 | elem->command.num_entries); |
655 | break; | 656 | break; |
656 | } | 657 | } |
658 | case TYPE_ARRAY: | ||
659 | memcpy(&cde_app->arrays[elem->array.id][0], | ||
660 | elem->array.data, | ||
661 | MAX_CDE_ARRAY_ENTRIES*sizeof(u32)); | ||
662 | break; | ||
657 | default: | 663 | default: |
658 | gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); | 664 | gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); |
659 | err = -EINVAL; | 665 | err = -EINVAL; |
@@ -853,27 +859,25 @@ static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g) | |||
853 | } | 859 | } |
854 | 860 | ||
855 | int gk20a_cde_convert(struct gk20a *g, | 861 | int gk20a_cde_convert(struct gk20a *g, |
856 | struct dma_buf *dst, | 862 | struct dma_buf *compbits_buf, |
857 | s32 dst_kind, u64 dst_byte_offset, | 863 | s32 compbits_kind, u64 compbits_byte_offset, |
858 | u32 dst_size, struct nvgpu_fence *fence, | 864 | u32 compbits_size, struct nvgpu_fence *fence, |
859 | u32 __flags, struct gk20a_cde_param *params, | 865 | u32 __flags, struct gk20a_cde_param *params, |
860 | int num_params, struct gk20a_fence **fence_out) | 866 | int num_params, struct gk20a_fence **fence_out) |
861 | __acquires(&cde_app->mutex) | 867 | __acquires(&cde_app->mutex) |
862 | __releases(&cde_app->mutex) | 868 | __releases(&cde_app->mutex) |
863 | { | 869 | { |
864 | struct gk20a_cde_app *cde_app = &g->cde_app; | 870 | struct gk20a_cde_ctx *cde_ctx = NULL; |
865 | struct gk20a_comptags comptags; | 871 | struct gk20a_comptags comptags; |
866 | struct gk20a_cde_ctx *cde_ctx; | 872 | u64 compbits_offset = 0; |
867 | u64 dst_vaddr = 0; | 873 | u64 map_vaddr = 0; |
874 | u64 map_offset = 0; | ||
875 | u32 map_size = 0; | ||
876 | u64 big_page_mask = 0; | ||
868 | u32 flags; | 877 | u32 flags; |
869 | int err, i; | 878 | int err, i; |
870 | 879 | ||
871 | if (!cde_app->initialised) { | 880 | mutex_lock(&g->cde_app.mutex); |
872 | gk20a_warn(&g->dev->dev, "cde: conversion requrest but no image has been provided"); | ||
873 | return -ENOSYS; | ||
874 | } | ||
875 | |||
876 | mutex_lock(&cde_app->mutex); | ||
877 | 881 | ||
878 | cde_ctx = gk20a_cde_get_context(g); | 882 | cde_ctx = gk20a_cde_get_context(g); |
879 | if (IS_ERR(cde_ctx)) { | 883 | if (IS_ERR(cde_ctx)) { |
@@ -881,38 +885,53 @@ __releases(&cde_app->mutex) | |||
881 | goto exit_unlock; | 885 | goto exit_unlock; |
882 | } | 886 | } |
883 | 887 | ||
884 | /* First, map the buffers to local va */ | 888 | /* First, map the buffer to local va */ |
885 | 889 | ||
886 | /* ensure that the dst buffer has drvdata */ | 890 | /* ensure that the compbits buffer has drvdata */ |
887 | err = gk20a_dmabuf_alloc_drvdata(dst, &g->dev->dev); | 891 | err = gk20a_dmabuf_alloc_drvdata(compbits_buf, &g->dev->dev); |
888 | if (err) | 892 | if (err) |
889 | goto exit_unlock; | 893 | goto exit_unlock; |
890 | 894 | ||
895 | /* compbits don't start at page aligned offset, so we need to align | ||
896 | the region to be mapped */ | ||
897 | big_page_mask = cde_ctx->vm->big_page_size - 1; | ||
898 | map_offset = compbits_byte_offset & ~big_page_mask; | ||
899 | |||
900 | /* compute compbit start offset from the beginning of the mapped | ||
901 | area */ | ||
902 | compbits_offset = compbits_byte_offset & big_page_mask; | ||
903 | |||
904 | if (!compbits_size) { | ||
905 | compbits_size = compbits_buf->size - compbits_byte_offset; | ||
906 | map_size = compbits_buf->size - map_offset; | ||
907 | } | ||
908 | |||
891 | /* map the destination buffer */ | 909 | /* map the destination buffer */ |
892 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ | 910 | get_dma_buf(compbits_buf); /* a ref for gk20a_vm_map */ |
893 | dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0, | 911 | map_vaddr = gk20a_vm_map(cde_ctx->vm, compbits_buf, 0, |
894 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | 912 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
895 | dst_kind, NULL, true, | 913 | compbits_kind, NULL, true, |
896 | gk20a_mem_flag_none, | 914 | gk20a_mem_flag_none, |
897 | 0, 0); | 915 | map_offset, map_size); |
898 | if (!dst_vaddr) { | 916 | if (!map_vaddr) { |
899 | dma_buf_put(dst); | 917 | dma_buf_put(compbits_buf); |
900 | err = -EINVAL; | 918 | err = -EINVAL; |
901 | goto exit_unlock; | 919 | goto exit_unlock; |
902 | } | 920 | } |
903 | 921 | ||
904 | if (!dst_size) | ||
905 | dst_size = dst->size - dst_byte_offset; | ||
906 | |||
907 | /* store source buffer compression tags */ | 922 | /* store source buffer compression tags */ |
908 | gk20a_get_comptags(&g->dev->dev, dst, &comptags); | 923 | gk20a_get_comptags(&g->dev->dev, compbits_buf, &comptags); |
909 | cde_ctx->src_vaddr = dst_vaddr; | 924 | cde_ctx->surf_param_offset = comptags.offset; |
910 | cde_ctx->src_param_offset = comptags.offset; | 925 | cde_ctx->surf_param_lines = comptags.lines; |
911 | cde_ctx->src_param_lines = comptags.lines; | 926 | |
927 | /* store surface vaddr. This is actually compbit vaddr, but since | ||
928 | compbits live in the same surface, and we can get the alloc base | ||
929 | address by using gk20a_mm_gpuva_to_iova_base, this will do */ | ||
930 | cde_ctx->surf_vaddr = map_vaddr; | ||
912 | 931 | ||
913 | /* store information about destination */ | 932 | /* store information about destination */ |
914 | cde_ctx->dest_vaddr = dst_vaddr + dst_byte_offset; | 933 | cde_ctx->compbit_vaddr = map_vaddr + compbits_offset; |
915 | cde_ctx->dest_size = dst_size; | 934 | cde_ctx->compbit_size = compbits_size; |
916 | 935 | ||
917 | /* remove existing argument data */ | 936 | /* remove existing argument data */ |
918 | memset(cde_ctx->user_param_values, 0, | 937 | memset(cde_ctx->user_param_values, 0, |
@@ -940,8 +959,8 @@ __releases(&cde_app->mutex) | |||
940 | 959 | ||
941 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", | 960 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", |
942 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); | 961 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); |
943 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", | 962 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n", |
944 | cde_ctx->dest_size, cde_ctx->dest_vaddr); | 963 | cde_ctx->compbit_size, cde_ctx->compbit_vaddr); |
945 | 964 | ||
946 | /* execute the init push buffer */ | 965 | /* execute the init push buffer */ |
947 | if (!cde_ctx->init_cmd_executed) { | 966 | if (!cde_ctx->init_cmd_executed) { |
@@ -964,11 +983,10 @@ __releases(&cde_app->mutex) | |||
964 | exit_unlock: | 983 | exit_unlock: |
965 | 984 | ||
966 | /* unmap the buffers - channel holds references to them now */ | 985 | /* unmap the buffers - channel holds references to them now */ |
967 | if (dst_vaddr) | 986 | if (map_vaddr) |
968 | gk20a_vm_unmap(cde_ctx->vm, dst_vaddr); | 987 | gk20a_vm_unmap(cde_ctx->vm, map_vaddr); |
969 | |||
970 | mutex_unlock(&cde_app->mutex); | ||
971 | 988 | ||
989 | mutex_unlock(&g->cde_app.mutex); | ||
972 | return err; | 990 | return err; |
973 | } | 991 | } |
974 | 992 | ||
@@ -1159,152 +1177,322 @@ __releases(&cde_app->mutex) | |||
1159 | return err; | 1177 | return err; |
1160 | } | 1178 | } |
1161 | 1179 | ||
1162 | enum cde_launch_patch_offset { | ||
1163 | /* dst buffer width in roptiles */ | ||
1164 | PATCH_USER_CONST_XTILES, | ||
1165 | /* dst buffer height in roptiles */ | ||
1166 | PATCH_USER_CONST_YTILES, | ||
1167 | /* dst buffer log2(block height) */ | ||
1168 | PATCH_USER_CONST_BLOCKHEIGHTLOG2, | ||
1169 | /* dst buffer pitch in bytes */ | ||
1170 | PATCH_USER_CONST_DSTPITCH, | ||
1171 | /* dst buffer write offset */ | ||
1172 | PATCH_USER_CONST_DSTOFFSET, | ||
1173 | /* comp cache index of the first page of the surface, | ||
1174 | * kernel looks it up from PTE */ | ||
1175 | PATCH_USER_CONST_FIRSTPAGEOFFSET, | ||
1176 | /* gmmu translated surface address, kernel fills */ | ||
1177 | PATCH_USER_CONST_SURFADDR, | ||
1178 | /* dst buffer address >> 8, kernel fills */ | ||
1179 | PATCH_VPC_DSTIMAGE_ADDR, | ||
1180 | /* dst buffer address >> 8, kernel fills */ | ||
1181 | PATCH_VPC_DSTIMAGE_ADDR2, | ||
1182 | /* dst buffer size - 1, kernel fills */ | ||
1183 | PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE, | ||
1184 | /* dst buffer size - 1, kernel fills */ | ||
1185 | PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE2, | ||
1186 | /* dst buffer size, kernel fills */ | ||
1187 | PATCH_VPC_DSTIMAGE_SIZE, | ||
1188 | /* dst buffer width in roptiles / work group width */ | ||
1189 | PATCH_VPC_CURRENT_GRID_SIZE_X, | ||
1190 | /* dst buffer height in roptiles / work group height */ | ||
1191 | PATCH_VPC_CURRENT_GRID_SIZE_Y, | ||
1192 | /* 1 */ | ||
1193 | PATCH_VPC_CURRENT_GRID_SIZE_Z, | ||
1194 | /* work group width, 16 seems to be quite optimal */ | ||
1195 | PATCH_VPC_CURRENT_GROUP_SIZE_X, | ||
1196 | /* work group height, 8 seems to be quite optimal */ | ||
1197 | PATCH_VPC_CURRENT_GROUP_SIZE_Y, | ||
1198 | /* 1 */ | ||
1199 | PATCH_VPC_CURRENT_GROUP_SIZE_Z, | ||
1200 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_X */ | ||
1201 | PATCH_QMD_CTA_RASTER_WIDTH, | ||
1202 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_Y */ | ||
1203 | PATCH_QMD_CTA_RASTER_HEIGHT, | ||
1204 | /* same as PATCH_VPC_CURRENT_GRID_SIZE_Z */ | ||
1205 | PATCH_QMD_CTA_RASTER_DEPTH, | ||
1206 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_X */ | ||
1207 | PATCH_QMD_CTA_THREAD_DIMENSION0, | ||
1208 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Y */ | ||
1209 | PATCH_QMD_CTA_THREAD_DIMENSION1, | ||
1210 | /* same as PATCH_VPC_CURRENT_GROUP_SIZE_Z */ | ||
1211 | PATCH_QMD_CTA_THREAD_DIMENSION2, | ||
1212 | |||
1213 | NUM_CDE_LAUNCH_PATCHES | ||
1214 | }; | ||
1215 | |||
1216 | enum cde_launch_patch_id { | 1180 | enum cde_launch_patch_id { |
1217 | PATCH_QMD_CTA_RASTER_WIDTH_ID = 1024, | 1181 | PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024, |
1218 | PATCH_QMD_CTA_RASTER_HEIGHT_ID = 1025, | 1182 | PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025, |
1219 | PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, | 1183 | PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, /* for firmware v0 only */ |
1220 | PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, | 1184 | PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, |
1221 | PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, | 1185 | PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, |
1222 | PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, | 1186 | PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, /* for firmware v0 only */ |
1223 | PATCH_USER_CONST_XTILES_ID = 1030, | 1187 | PATCH_USER_CONST_XTILES_ID = 1030, /* for firmware v0 only */ |
1224 | PATCH_USER_CONST_YTILES_ID = 1031, | 1188 | PATCH_USER_CONST_YTILES_ID = 1031, /* for firmware v0 only */ |
1225 | PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, | 1189 | PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, |
1226 | PATCH_USER_CONST_DSTPITCH_ID = 1033, | 1190 | PATCH_USER_CONST_DSTPITCH_ID = 1033, /* for firmware v0 only */ |
1227 | PATCH_USER_CONST_DSTOFFSET_ID = 1034, | 1191 | PATCH_H_USER_CONST_FLAGS_ID = 1034, /* for firmware v0 only */ |
1228 | PATCH_VPC_CURRENT_GRID_SIZE_X_ID = 1035, | 1192 | PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035, |
1229 | PATCH_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, | 1193 | PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, |
1230 | PATCH_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, | 1194 | PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, |
1231 | PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, | 1195 | PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, |
1232 | PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, | 1196 | PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, |
1233 | PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, | 1197 | PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, |
1198 | PATCH_USER_CONST_XBLOCKS_ID = 1041, | ||
1199 | PATCH_H_USER_CONST_DSTOFFSET_ID = 1042, | ||
1200 | PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043, | ||
1201 | PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044, | ||
1202 | PATCH_V_USER_CONST_DSTOFFSET_ID = 1045, | ||
1203 | PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046, | ||
1204 | PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047, | ||
1205 | PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048, | ||
1206 | PATCH_H_LAUNCH_WORD1_ID = 1049, | ||
1207 | PATCH_H_LAUNCH_WORD2_ID = 1050, | ||
1208 | PATCH_V_LAUNCH_WORD1_ID = 1051, | ||
1209 | PATCH_V_LAUNCH_WORD2_ID = 1052, | ||
1210 | PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053, | ||
1211 | PATCH_H_QMD_REGISTER_COUNT_ID = 1054, | ||
1212 | PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055, | ||
1213 | PATCH_V_QMD_REGISTER_COUNT_ID = 1056, | ||
1234 | }; | 1214 | }; |
1235 | 1215 | ||
1236 | static int gk20a_buffer_convert_gpu_to_cde( | 1216 | enum programs { |
1237 | struct gk20a *g, struct dma_buf *dmabuf, u32 consumer, | 1217 | PROG_HPASS = 0, |
1238 | u64 offset, u64 compbits_offset, | 1218 | PROG_VPASS_LARGE = 1, |
1219 | PROG_VPASS_SMALL = 2, | ||
1220 | PROG_HPASS_DEBUG = 3, | ||
1221 | PROG_VPASS_LARGE_DEBUG = 4, | ||
1222 | PROG_VPASS_SMALL_DEBUG = 5, | ||
1223 | PROG_PASSTHROUGH = 6, | ||
1224 | NUM_PROGRAMS = 7 | ||
1225 | }; | ||
1226 | |||
1227 | /* maximum number of WRITE_PATCHes in the below function */ | ||
1228 | #define MAX_CDE_LAUNCH_PATCHES 32 | ||
1229 | |||
1230 | static int gk20a_buffer_convert_gpu_to_cde_v0( | ||
1231 | struct gk20a *g, | ||
1232 | struct dma_buf *dmabuf, u32 consumer, | ||
1233 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1239 | u32 width, u32 height, u32 block_height_log2, | 1234 | u32 width, u32 height, u32 block_height_log2, |
1240 | u32 submit_flags, struct nvgpu_fence *fence_in, | 1235 | u32 submit_flags, struct nvgpu_fence *fence_in, |
1241 | struct gk20a_fence **fence_out) | 1236 | struct gk20a_buffer_state *state) |
1242 | { | 1237 | { |
1243 | struct gk20a_cde_param params[NUM_CDE_LAUNCH_PATCHES]; | 1238 | struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES]; |
1244 | int param = 0; | 1239 | int param = 0; |
1245 | int err = 0; | 1240 | int err = 0; |
1241 | struct gk20a_fence *new_fence = NULL; | ||
1242 | const int wgx = 8; | ||
1243 | const int wgy = 8; | ||
1244 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | ||
1245 | const int xalign = compbits_per_byte * wgx; | ||
1246 | const int yalign = wgy; | ||
1246 | 1247 | ||
1247 | /* Compute per launch parameters */ | 1248 | /* firmware v0 needs to call swizzling twice */ |
1248 | const bool transpose = (consumer == NVGPU_GPU_COMPBITS_CDEV); | 1249 | int i; |
1249 | const int transposed_width = transpose ? height : width; | 1250 | for (i = 0; i < 2; i++) { |
1250 | const int transposed_height = transpose ? width : height; | 1251 | /* Compute per launch parameters */ |
1251 | const int xtiles = (transposed_width + 7) >> 3; | 1252 | const bool vpass = (i == 1); |
1252 | const int ytiles = (transposed_height + 7) >> 3; | 1253 | const int transposed_width = vpass ? height : width; |
1254 | const int transposed_height = vpass ? width : height; | ||
1255 | const int xtiles = (transposed_width + 7) >> 3; | ||
1256 | const int ytiles = (transposed_height + 7) >> 3; | ||
1257 | const int gridw = roundup(xtiles, xalign) / xalign; | ||
1258 | const int gridh = roundup(ytiles, yalign) / yalign; | ||
1259 | const int flags = (vpass ? 4 : 0) | | ||
1260 | g->cde_app.shader_parameter; | ||
1261 | const int dst_stride = 128; /* chip constant */ | ||
1262 | |||
1263 | if ((vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEV)) || | ||
1264 | (!vpass && !(consumer & NVGPU_GPU_COMPBITS_CDEH))) | ||
1265 | continue; | ||
1266 | |||
1267 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) | ||
1268 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", | ||
1269 | xtiles, ytiles); | ||
1270 | |||
1271 | gk20a_dbg(gpu_dbg_cde, "pass=%c", vpass ? 'V' : 'H'); | ||
1272 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx", | ||
1273 | width, height, block_height_log2, | ||
1274 | compbits_hoffset, compbits_voffset); | ||
1275 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)", | ||
1276 | width, height, xtiles, ytiles); | ||
1277 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | ||
1278 | wgx, wgy, gridw, gridh); | ||
1279 | |||
1280 | /* Write parameters */ | ||
1281 | #define WRITE_PATCH(NAME, VALUE) \ | ||
1282 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | ||
1283 | param = 0; | ||
1284 | WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles); | ||
1285 | WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles); | ||
1286 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, | ||
1287 | block_height_log2); | ||
1288 | WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride); | ||
1289 | WRITE_PATCH(PATCH_H_USER_CONST_FLAGS, flags); | ||
1290 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw); | ||
1291 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh); | ||
1292 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1293 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); | ||
1294 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); | ||
1295 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); | ||
1296 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw); | ||
1297 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh); | ||
1298 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1); | ||
1299 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); | ||
1300 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); | ||
1301 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1); | ||
1302 | #undef WRITE_PATCH | ||
1303 | |||
1304 | err = gk20a_cde_convert(g, dmabuf, | ||
1305 | 0, /* dst kind */ | ||
1306 | vpass ? | ||
1307 | compbits_voffset : | ||
1308 | compbits_hoffset, | ||
1309 | 0, /* dst_size, 0 = auto */ | ||
1310 | fence_in, submit_flags, | ||
1311 | params, param, | ||
1312 | &new_fence); | ||
1313 | if (err) | ||
1314 | goto out; | ||
1315 | |||
1316 | /* compbits generated, update state & fence */ | ||
1317 | gk20a_fence_put(state->fence); | ||
1318 | state->fence = new_fence; | ||
1319 | state->valid_compbits |= vpass ? | ||
1320 | NVGPU_GPU_COMPBITS_CDEV : | ||
1321 | NVGPU_GPU_COMPBITS_CDEH; | ||
1322 | } | ||
1323 | out: | ||
1324 | return err; | ||
1325 | } | ||
1326 | |||
1327 | static int gk20a_buffer_convert_gpu_to_cde_v1( | ||
1328 | struct gk20a *g, | ||
1329 | struct dma_buf *dmabuf, u32 consumer, | ||
1330 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1331 | u32 width, u32 height, u32 block_height_log2, | ||
1332 | u32 submit_flags, struct nvgpu_fence *fence_in, | ||
1333 | struct gk20a_buffer_state *state) | ||
1334 | { | ||
1335 | struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES]; | ||
1336 | int param = 0; | ||
1337 | int err = 0; | ||
1338 | struct gk20a_fence *new_fence = NULL; | ||
1253 | const int wgx = 8; | 1339 | const int wgx = 8; |
1254 | const int wgy = 8; | 1340 | const int wgy = 8; |
1255 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | 1341 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ |
1256 | const int dst_stride = 128; /* TODO chip constant */ | ||
1257 | const int xalign = compbits_per_byte * wgx; | 1342 | const int xalign = compbits_per_byte * wgx; |
1258 | const int yalign = wgy; | 1343 | const int yalign = wgy; |
1259 | const int gridw = roundup(xtiles, xalign) / xalign; | ||
1260 | const int gridh = roundup(ytiles, yalign) / yalign; | ||
1261 | 1344 | ||
1262 | if (!g->cde_app.initialised) | 1345 | /* Compute per launch parameters */ |
1263 | return -ENOSYS; | 1346 | const int xtiles = (width + 7) >> 3; |
1347 | const int ytiles = (height + 7) >> 3; | ||
1348 | const int gridw_h = roundup(xtiles, xalign) / xalign; | ||
1349 | const int gridh_h = roundup(ytiles, yalign) / yalign; | ||
1350 | const int gridw_v = roundup(ytiles, xalign) / xalign; | ||
1351 | const int gridh_v = roundup(xtiles, yalign) / yalign; | ||
1352 | const int xblocks = (xtiles + 1) >> 1; | ||
1353 | const int voffset = compbits_voffset - compbits_hoffset; | ||
1354 | |||
1355 | int hprog = PROG_HPASS; | ||
1356 | int vprog = (block_height_log2 >= 2) ? | ||
1357 | PROG_VPASS_LARGE : PROG_VPASS_SMALL; | ||
1358 | if (g->cde_app.shader_parameter == 1) { | ||
1359 | hprog = PROG_PASSTHROUGH; | ||
1360 | vprog = PROG_PASSTHROUGH; | ||
1361 | } else if (g->cde_app.shader_parameter == 2) { | ||
1362 | hprog = PROG_HPASS_DEBUG; | ||
1363 | vprog = (block_height_log2 >= 2) ? | ||
1364 | PROG_VPASS_LARGE_DEBUG : | ||
1365 | PROG_VPASS_SMALL_DEBUG; | ||
1366 | } | ||
1264 | 1367 | ||
1265 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) | 1368 | if (xtiles > 4096 / 8 || ytiles > 4096 / 8) |
1266 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", | 1369 | gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)", |
1267 | xtiles, ytiles); | 1370 | xtiles, ytiles); |
1268 | 1371 | ||
1269 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", | 1372 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx", |
1270 | width, height, block_height_log2, compbits_offset); | 1373 | width, height, block_height_log2, |
1271 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", | 1374 | compbits_hoffset, compbits_voffset); |
1272 | width, height, xtiles, ytiles, gridw*wgx, gridh*wgy); | 1375 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)", |
1273 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | 1376 | width, height, xtiles, ytiles); |
1274 | wgx, wgy, gridw, gridh); | 1377 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)", |
1378 | wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v); | ||
1379 | gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d", | ||
1380 | hprog, | ||
1381 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog], | ||
1382 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog], | ||
1383 | vprog, | ||
1384 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog], | ||
1385 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]); | ||
1275 | 1386 | ||
1276 | /* Write parameters */ | 1387 | /* Write parameters */ |
1277 | #define WRITE_PATCH(NAME, VALUE) \ | 1388 | #define WRITE_PATCH(NAME, VALUE) \ |
1278 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | 1389 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} |
1279 | WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles); | 1390 | WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks); |
1280 | WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles); | 1391 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, |
1281 | WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, block_height_log2); | 1392 | block_height_log2); |
1282 | WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride); | 1393 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); |
1283 | WRITE_PATCH(PATCH_USER_CONST_DSTOFFSET, | 1394 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); |
1284 | (transpose ? 4 : 0) | g->cde_app.shader_parameter); | ||
1285 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_X, gridw); | ||
1286 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Y, gridh); | ||
1287 | WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1288 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); | 1395 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx); |
1289 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); | 1396 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy); |
1290 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); | 1397 | WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1); |
1291 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_WIDTH, gridw); | 1398 | |
1292 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_HEIGHT, gridh); | 1399 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h); |
1293 | WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1); | 1400 | WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h); |
1294 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx); | 1401 | WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0); |
1295 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy); | 1402 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h); |
1296 | WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1); | 1403 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h); |
1404 | WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1405 | |||
1406 | WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v); | ||
1407 | WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v); | ||
1408 | WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset); | ||
1409 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v); | ||
1410 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v); | ||
1411 | WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1); | ||
1412 | |||
1413 | WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET, | ||
1414 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]); | ||
1415 | WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT, | ||
1416 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]); | ||
1417 | WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET, | ||
1418 | g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]); | ||
1419 | WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT, | ||
1420 | g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]); | ||
1421 | |||
1422 | if (consumer & NVGPU_GPU_COMPBITS_CDEH) { | ||
1423 | WRITE_PATCH(PATCH_H_LAUNCH_WORD1, | ||
1424 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]); | ||
1425 | WRITE_PATCH(PATCH_H_LAUNCH_WORD2, | ||
1426 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]); | ||
1427 | } else { | ||
1428 | WRITE_PATCH(PATCH_H_LAUNCH_WORD1, | ||
1429 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]); | ||
1430 | WRITE_PATCH(PATCH_H_LAUNCH_WORD2, | ||
1431 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]); | ||
1432 | } | ||
1433 | |||
1434 | if (consumer & NVGPU_GPU_COMPBITS_CDEV) { | ||
1435 | WRITE_PATCH(PATCH_V_LAUNCH_WORD1, | ||
1436 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]); | ||
1437 | WRITE_PATCH(PATCH_V_LAUNCH_WORD2, | ||
1438 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]); | ||
1439 | } else { | ||
1440 | WRITE_PATCH(PATCH_V_LAUNCH_WORD1, | ||
1441 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]); | ||
1442 | WRITE_PATCH(PATCH_V_LAUNCH_WORD2, | ||
1443 | g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]); | ||
1444 | } | ||
1297 | #undef WRITE_PATCH | 1445 | #undef WRITE_PATCH |
1298 | 1446 | ||
1299 | err = gk20a_busy(g->dev); | ||
1300 | if (err) | ||
1301 | return err; | ||
1302 | err = gk20a_cde_convert(g, dmabuf, | 1447 | err = gk20a_cde_convert(g, dmabuf, |
1303 | 0, /* dst kind */ | 1448 | 0, /* dst kind */ |
1304 | compbits_offset, | 1449 | compbits_hoffset, |
1305 | 0, /* dst_size, 0 = auto */ | 1450 | 0, /* dst_size, 0 = auto */ |
1306 | fence_in, submit_flags, | 1451 | fence_in, submit_flags, |
1307 | params, param, fence_out); | 1452 | params, param, &new_fence); |
1453 | if (err) | ||
1454 | goto out; | ||
1455 | |||
1456 | /* compbits generated, update state & fence */ | ||
1457 | gk20a_fence_put(state->fence); | ||
1458 | state->fence = new_fence; | ||
1459 | state->valid_compbits |= consumer & | ||
1460 | (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV); | ||
1461 | out: | ||
1462 | return err; | ||
1463 | } | ||
1464 | |||
1465 | static int gk20a_buffer_convert_gpu_to_cde( | ||
1466 | struct gk20a *g, struct dma_buf *dmabuf, u32 consumer, | ||
1467 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1468 | u32 width, u32 height, u32 block_height_log2, | ||
1469 | u32 submit_flags, struct nvgpu_fence *fence_in, | ||
1470 | struct gk20a_buffer_state *state) | ||
1471 | { | ||
1472 | int err = 0; | ||
1473 | |||
1474 | if (!g->cde_app.initialised) | ||
1475 | return -ENOSYS; | ||
1476 | |||
1477 | err = gk20a_busy(g->dev); | ||
1478 | if (err) | ||
1479 | return err; | ||
1480 | |||
1481 | gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n", | ||
1482 | g->cde_app.firmware_version); | ||
1483 | |||
1484 | if (g->cde_app.firmware_version == 0) { | ||
1485 | err = gk20a_buffer_convert_gpu_to_cde_v0( | ||
1486 | g, dmabuf, consumer, offset, compbits_hoffset, | ||
1487 | compbits_voffset, width, height, block_height_log2, | ||
1488 | submit_flags, fence_in, state); | ||
1489 | } else { | ||
1490 | err = gk20a_buffer_convert_gpu_to_cde_v1( | ||
1491 | g, dmabuf, consumer, offset, compbits_hoffset, | ||
1492 | compbits_voffset, width, height, block_height_log2, | ||
1493 | submit_flags, fence_in, state); | ||
1494 | } | ||
1495 | |||
1308 | gk20a_idle(g->dev); | 1496 | gk20a_idle(g->dev); |
1309 | return err; | 1497 | return err; |
1310 | } | 1498 | } |
@@ -1326,7 +1514,8 @@ int gk20a_prepare_compressible_read( | |||
1326 | if (IS_ERR(dmabuf)) | 1514 | if (IS_ERR(dmabuf)) |
1327 | return -EINVAL; | 1515 | return -EINVAL; |
1328 | 1516 | ||
1329 | err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state); | 1517 | err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), |
1518 | offset, &state); | ||
1330 | if (err) { | 1519 | if (err) { |
1331 | dma_buf_put(dmabuf); | 1520 | dma_buf_put(dmabuf); |
1332 | return err; | 1521 | return err; |
@@ -1345,40 +1534,20 @@ int gk20a_prepare_compressible_read( | |||
1345 | err = -EINVAL; | 1534 | err = -EINVAL; |
1346 | goto out; | 1535 | goto out; |
1347 | } else if (missing_bits) { | 1536 | } else if (missing_bits) { |
1348 | struct gk20a_fence *new_fence = NULL; | 1537 | u32 missing_cde_bits = missing_bits & |
1538 | (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV); | ||
1349 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && | 1539 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && |
1350 | (missing_bits & NVGPU_GPU_COMPBITS_CDEH)) { | 1540 | missing_cde_bits) { |
1351 | err = gk20a_buffer_convert_gpu_to_cde( | 1541 | err = gk20a_buffer_convert_gpu_to_cde( |
1352 | g, dmabuf, | 1542 | g, dmabuf, |
1353 | NVGPU_GPU_COMPBITS_CDEH, | 1543 | missing_cde_bits, |
1354 | offset, compbits_hoffset, | 1544 | offset, compbits_hoffset, |
1545 | compbits_voffset, | ||
1355 | width, height, block_height_log2, | 1546 | width, height, block_height_log2, |
1356 | submit_flags, fence, | 1547 | submit_flags, fence, |
1357 | &new_fence); | 1548 | state); |
1358 | if (err) | 1549 | if (err) |
1359 | goto out; | 1550 | goto out; |
1360 | |||
1361 | /* CDEH bits generated, update state & fence */ | ||
1362 | gk20a_fence_put(state->fence); | ||
1363 | state->fence = new_fence; | ||
1364 | state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEH; | ||
1365 | } | ||
1366 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && | ||
1367 | (missing_bits & NVGPU_GPU_COMPBITS_CDEV)) { | ||
1368 | err = gk20a_buffer_convert_gpu_to_cde( | ||
1369 | g, dmabuf, | ||
1370 | NVGPU_GPU_COMPBITS_CDEV, | ||
1371 | offset, compbits_voffset, | ||
1372 | width, height, block_height_log2, | ||
1373 | submit_flags, fence, | ||
1374 | &new_fence); | ||
1375 | if (err) | ||
1376 | goto out; | ||
1377 | |||
1378 | /* CDEH bits generated, update state & fence */ | ||
1379 | gk20a_fence_put(state->fence); | ||
1380 | state->fence = new_fence; | ||
1381 | state->valid_compbits |= NVGPU_GPU_COMPBITS_CDEV; | ||
1382 | } | 1551 | } |
1383 | } | 1552 | } |
1384 | 1553 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h index 3347490c..b160162c 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h | |||
@@ -23,8 +23,9 @@ | |||
23 | 23 | ||
24 | #define MAX_CDE_BUFS 10 | 24 | #define MAX_CDE_BUFS 10 |
25 | #define MAX_CDE_PARAMS 64 | 25 | #define MAX_CDE_PARAMS 64 |
26 | #define MAX_CDE_USER_PARAMS 32 | 26 | #define MAX_CDE_USER_PARAMS 40 |
27 | #define MAX_CDE_OBJ_IDS 4 | 27 | #define MAX_CDE_OBJ_IDS 4 |
28 | #define MAX_CDE_ARRAY_ENTRIES 9 | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * The size of the context ring buffer that is dedicated for handling cde | 31 | * The size of the context ring buffer that is dedicated for handling cde |
@@ -162,6 +163,22 @@ struct gk20a_cde_cmd_elem { | |||
162 | }; | 163 | }; |
163 | 164 | ||
164 | /* | 165 | /* |
166 | * This element is used for storing a small array of data. | ||
167 | */ | ||
168 | |||
169 | enum { | ||
170 | ARRAY_PROGRAM_OFFSET = 0, | ||
171 | ARRAY_REGISTER_COUNT, | ||
172 | ARRAY_LAUNCH_COMMAND, | ||
173 | NUM_CDE_ARRAYS | ||
174 | }; | ||
175 | |||
176 | struct gk20a_cde_hdr_array { | ||
177 | u32 id; | ||
178 | u32 data[MAX_CDE_ARRAY_ENTRIES]; | ||
179 | }; | ||
180 | |||
181 | /* | ||
165 | * Following defines a single header element. Each element has a type and | 182 | * Following defines a single header element. Each element has a type and |
166 | * some of the data structures. | 183 | * some of the data structures. |
167 | */ | 184 | */ |
@@ -175,6 +192,7 @@ struct gk20a_cde_hdr_elem { | |||
175 | struct gk20a_cde_hdr_param param; | 192 | struct gk20a_cde_hdr_param param; |
176 | u32 required_class; | 193 | u32 required_class; |
177 | struct gk20a_cde_hdr_command command; | 194 | struct gk20a_cde_hdr_command command; |
195 | struct gk20a_cde_hdr_array array; | ||
178 | }; | 196 | }; |
179 | }; | 197 | }; |
180 | 198 | ||
@@ -183,7 +201,8 @@ enum { | |||
183 | TYPE_REPLACE, | 201 | TYPE_REPLACE, |
184 | TYPE_PARAM, | 202 | TYPE_PARAM, |
185 | TYPE_REQUIRED_CLASS, | 203 | TYPE_REQUIRED_CLASS, |
186 | TYPE_COMMAND | 204 | TYPE_COMMAND, |
205 | TYPE_ARRAY | ||
187 | }; | 206 | }; |
188 | 207 | ||
189 | struct gk20a_cde_mem_desc { | 208 | struct gk20a_cde_mem_desc { |
@@ -219,14 +238,12 @@ struct gk20a_cde_ctx { | |||
219 | /* storage for user space parameter values */ | 238 | /* storage for user space parameter values */ |
220 | u32 user_param_values[MAX_CDE_USER_PARAMS]; | 239 | u32 user_param_values[MAX_CDE_USER_PARAMS]; |
221 | 240 | ||
222 | u64 src_smmu_addr; | 241 | u32 surf_param_offset; |
223 | u32 src_param_offset; | 242 | u32 surf_param_lines; |
224 | u32 src_param_lines; | 243 | u64 surf_vaddr; |
225 | 244 | ||
226 | u64 src_vaddr; | 245 | u64 compbit_vaddr; |
227 | 246 | u64 compbit_size; | |
228 | u64 dest_vaddr; | ||
229 | u64 dest_size; | ||
230 | 247 | ||
231 | u32 obj_ids[MAX_CDE_OBJ_IDS]; | 248 | u32 obj_ids[MAX_CDE_OBJ_IDS]; |
232 | int num_obj_ids; | 249 | int num_obj_ids; |
@@ -259,6 +276,10 @@ struct gk20a_cde_app { | |||
259 | int ctx_usecount; | 276 | int ctx_usecount; |
260 | int ctx_count_top; | 277 | int ctx_count_top; |
261 | 278 | ||
279 | u32 firmware_version; | ||
280 | |||
281 | u32 arrays[NUM_CDE_ARRAYS][MAX_CDE_ARRAY_ENTRIES]; | ||
282 | |||
262 | u32 shader_parameter; | 283 | u32 shader_parameter; |
263 | }; | 284 | }; |
264 | 285 | ||
@@ -266,9 +287,9 @@ void gk20a_cde_destroy(struct gk20a *g); | |||
266 | void gk20a_cde_suspend(struct gk20a *g); | 287 | void gk20a_cde_suspend(struct gk20a *g); |
267 | int gk20a_init_cde_support(struct gk20a *g); | 288 | int gk20a_init_cde_support(struct gk20a *g); |
268 | int gk20a_cde_reload(struct gk20a *g); | 289 | int gk20a_cde_reload(struct gk20a *g); |
269 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, | 290 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *compbits_buf, |
270 | s32 dst_kind, u64 dst_word_offset, | 291 | s32 compbits_kind, u64 compbits_word_offset, |
271 | u32 dst_size, struct nvgpu_fence *fence, | 292 | u32 compbits_size, struct nvgpu_fence *fence, |
272 | u32 __flags, struct gk20a_cde_param *params, | 293 | u32 __flags, struct gk20a_cde_param *params, |
273 | int num_params, struct gk20a_fence **fence_out); | 294 | int num_params, struct gk20a_fence **fence_out); |
274 | void gk20a_cde_debugfs_init(struct platform_device *dev); | 295 | void gk20a_cde_debugfs_init(struct platform_device *dev); |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index a390e36b..08dd41c5 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -1546,7 +1546,7 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm, | |||
1546 | return vaddr; | 1546 | return vaddr; |
1547 | } | 1547 | } |
1548 | 1548 | ||
1549 | dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr) | 1549 | dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr) |
1550 | { | 1550 | { |
1551 | struct mapped_buffer_node *buffer; | 1551 | struct mapped_buffer_node *buffer; |
1552 | dma_addr_t addr = 0; | 1552 | dma_addr_t addr = 0; |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 3f7042ee..efed79f8 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -530,7 +530,7 @@ int gk20a_vm_map_buffer(struct gk20a_as_share *as_share, | |||
530 | int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset); | 530 | int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset); |
531 | void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf, | 531 | void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf, |
532 | struct gk20a_comptags *comptags); | 532 | struct gk20a_comptags *comptags); |
533 | dma_addr_t gk20a_mm_gpuva_to_iova(struct vm_gk20a *vm, u64 gpu_vaddr); | 533 | dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr); |
534 | 534 | ||
535 | int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); | 535 | int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); |
536 | 536 | ||