author	Lauri Peltonen <lpeltonen@nvidia.com>	2014-07-18 09:02:23 -0400
committer	Dan Willemsen <dwillemsen@nvidia.com>	2015-03-18 15:10:44 -0400
commit	574ee40e51bf3f4fe989f8e572e611ae4ffa0795 (patch)
tree	4083fb74ed6861d679299131f3577c09c33ff99d /drivers/gpu/nvgpu/gk20a/cde_gk20a.c
parent	c8faa10d1dc9bb0c4c2815c38fb71d8acdd1108d (diff)
gpu: nvgpu: Add compression state IOCTLs
Bug 1409151

Change-Id: I29a325d7c2b481764fc82d945795d50bcb841961
Signed-off-by: Lauri Peltonen <lpeltonen@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/cde_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/cde_gk20a.c	| 335 ++++++++++++++++++++++++++++++++----
1 file changed, 312 insertions(+), 23 deletions(-)
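
Note: the first half of the diff changes gk20a_cde_convert() to take struct
dma_buf pointers instead of raw fds, so the fd-to-dma_buf translation (and the
reference it returns) now belongs to the IOCTL-facing callers added at the end
of the file. A minimal sketch of the resulting ownership pattern, assuming a
hypothetical caller do_convert_ioctl(); dma_buf_get(), get_dma_buf() and
dma_buf_put() are the stock kernel dma-buf refcounting calls:

	static int do_convert_ioctl(struct gk20a *g, u32 fd, u64 compbits_offset)
	{
		struct dma_buf *buf;
		int err;

		buf = dma_buf_get(fd);	/* +1 ref, owned by this caller */
		if (IS_ERR(buf))
			return -EINVAL;

		/* gk20a_cde_convert() no longer sees fds; internally it takes
		 * one extra ref per mapping (get_dma_buf() before
		 * gk20a_vm_map()) and drops it again if the map fails. */
		err = gk20a_cde_convert(g, buf, buf, 0, compbits_offset, 0,
					NULL, 0, NULL, 0, NULL);

		dma_buf_put(buf);	/* drop the caller's ref */
		return err;
	}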
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index d01426be..46568879 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -19,6 +19,7 @@
 #include <linux/nvhost.h>
 #include <linux/dma-mapping.h>
 #include <linux/firmware.h>
+#include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/dma-buf.h>
 
@@ -596,7 +597,8 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 			num_entries, flags, fence, fence_out);
 }
 
-int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd,
+int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
+		      struct dma_buf *dst,
 		      s32 dst_kind, u64 dst_byte_offset,
 		      u32 dst_size, struct nvhost_fence *fence,
 		      u32 __flags, struct gk20a_cde_param *params,
@@ -605,7 +607,6 @@ int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd,
 	struct gk20a_cde_app *cde_app = &g->cde_app;
 	struct gk20a_comptags comptags;
 	struct gk20a_cde_ctx *cde_ctx;
-	struct dma_buf *src = NULL, *dst = NULL;
 	u64 dst_vaddr = 0, src_vaddr = 0;
 	u32 flags;
 	int err, i;
@@ -622,14 +623,7 @@ int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd,
 	cde_app->cde_ctx_ptr = (cde_app->cde_ctx_ptr + 1) %
 		ARRAY_SIZE(cde_app->cde_ctx);
 
-	/* First, get buffer references and map the buffers to local va */
-
-	dst = dma_buf_get(dst_fd);
-	if (IS_ERR(src)) {
-		dst = NULL;
-		err = -EINVAL;
-		goto exit_unlock;
-	}
+	/* First, map the buffers to local va */
 
 	/* ensure that the dst buffer has drvdata */
 	err = gk20a_dmabuf_alloc_drvdata(dst, &g->dev->dev);
@@ -637,18 +631,13 @@ int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd,
 		goto exit_unlock;
 
 	/* map the destination buffer */
+	get_dma_buf(dst); /* a ref for gk20a_vm_map */
 	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
 				 0, dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!dst_vaddr) {
-		err = -EINVAL;
-		goto exit_unlock;
-	}
-
-	src = dma_buf_get(src_fd);
-	if (IS_ERR(src)) {
-		src = NULL;
+		dma_buf_put(dst);
 		err = -EINVAL;
 		goto exit_unlock;
 	}
@@ -659,11 +648,13 @@ int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd,
 		goto exit_unlock;
 
 	/* map the source buffer to prevent premature release */
+	get_dma_buf(src); /* a ref for gk20a_vm_map */
 	src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
 				 0, dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!src_vaddr) {
+		dma_buf_put(src);
 		err = -EINVAL;
 		goto exit_unlock;
 	}
@@ -765,12 +756,6 @@ exit_unlock:
 	if (src_vaddr)
 		gk20a_vm_unmap(g->cde_app.vm, src_vaddr);
 
-	/* drop dmabuf refs if work was aborted */
-	if (err && src)
-		dma_buf_put(src);
-	if (err && dst)
-		dma_buf_put(dst);
-
 	mutex_unlock(&cde_app->mutex);
 
 	return err;
@@ -922,3 +907,307 @@ err_init_instance:
 	}
 	return ret;
 }
+
+enum cde_launch_patch_offset {
+	/* dst buffer width in roptiles */
+	PATCH_USER_CONST_XTILES,
+	/* dst buffer height in roptiles */
+	PATCH_USER_CONST_YTILES,
+	/* dst buffer log2(block height) */
+	PATCH_USER_CONST_BLOCKHEIGHTLOG2,
+	/* dst buffer pitch in bytes */
+	PATCH_USER_CONST_DSTPITCH,
+	/* dst buffer write offset */
+	PATCH_USER_CONST_DSTOFFSET,
+	/* comp cache index of the first page of the surface,
+	 * kernel looks it up from PTE */
+	PATCH_USER_CONST_FIRSTPAGEOFFSET,
+	/* gmmu translated surface address, kernel fills */
+	PATCH_USER_CONST_SURFADDR,
+	/* dst buffer address >> 8, kernel fills */
+	PATCH_VPC_DSTIMAGE_ADDR,
+	/* dst buffer address >> 8, kernel fills */
+	PATCH_VPC_DSTIMAGE_ADDR2,
+	/* dst buffer size - 1, kernel fills */
+	PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE,
+	/* dst buffer size - 1, kernel fills */
+	PATCH_VPC_DSTIMAGE_SIZE_MINUS_ONE2,
+	/* dst buffer size, kernel fills */
+	PATCH_VPC_DSTIMAGE_SIZE,
+	/* dst buffer width in roptiles / work group width */
+	PATCH_VPC_CURRENT_GRID_SIZE_X,
+	/* dst buffer height in roptiles / work group height */
+	PATCH_VPC_CURRENT_GRID_SIZE_Y,
+	/* 1 */
+	PATCH_VPC_CURRENT_GRID_SIZE_Z,
+	/* work group width, 16 seems to be quite optimal */
+	PATCH_VPC_CURRENT_GROUP_SIZE_X,
+	/* work group height, 8 seems to be quite optimal */
+	PATCH_VPC_CURRENT_GROUP_SIZE_Y,
+	/* 1 */
+	PATCH_VPC_CURRENT_GROUP_SIZE_Z,
+	/* same as PATCH_VPC_CURRENT_GRID_SIZE_X */
+	PATCH_QMD_CTA_RASTER_WIDTH,
+	/* same as PATCH_VPC_CURRENT_GRID_SIZE_Y */
+	PATCH_QMD_CTA_RASTER_HEIGHT,
+	/* same as PATCH_VPC_CURRENT_GRID_SIZE_Z */
+	PATCH_QMD_CTA_RASTER_DEPTH,
+	/* same as PATCH_VPC_CURRENT_GROUP_SIZE_X */
+	PATCH_QMD_CTA_THREAD_DIMENSION0,
+	/* same as PATCH_VPC_CURRENT_GROUP_SIZE_Y */
+	PATCH_QMD_CTA_THREAD_DIMENSION1,
+	/* same as PATCH_VPC_CURRENT_GROUP_SIZE_Z */
+	PATCH_QMD_CTA_THREAD_DIMENSION2,
+
+	NUM_CDE_LAUNCH_PATCHES
+};
+
+enum cde_launch_patch_id {
+	PATCH_QMD_CTA_RASTER_WIDTH_ID = 1024,
+	PATCH_QMD_CTA_RASTER_HEIGHT_ID = 1025,
+	PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026,
+	PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027,
+	PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028,
+	PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029,
+	PATCH_USER_CONST_XTILES_ID = 1030,
+	PATCH_USER_CONST_YTILES_ID = 1031,
+	PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032,
+	PATCH_USER_CONST_DSTPITCH_ID = 1033,
+	PATCH_USER_CONST_DSTOFFSET_ID = 1034,
+	PATCH_VPC_CURRENT_GRID_SIZE_X_ID = 1035,
+	PATCH_VPC_CURRENT_GRID_SIZE_Y_ID = 1036,
+	PATCH_VPC_CURRENT_GRID_SIZE_Z_ID = 1037,
+	PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038,
+	PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039,
+	PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040,
+};
+
+static int gk20a_buffer_convert_gpu_to_cde(
+		struct gk20a *g, struct dma_buf *dmabuf, u32 consumer,
+		u64 offset, u64 compbits_offset,
+		u32 width, u32 height, u32 block_height_log2,
+		u32 submit_flags, struct nvhost_fence *fence_in,
+		struct gk20a_fence **fence_out)
+{
+	struct gk20a_cde_param params[NUM_CDE_LAUNCH_PATCHES];
+	int param = 0;
+	int err = 0;
+
+	/* Compute per launch parameters */
+	const bool transpose = (consumer == NVHOST_GPU_COMPBITS_CDEV);
+	const int transposed_width = transpose ? height : width;
+	const int transposed_height = transpose ? width : height;
+	const int xtiles = (transposed_width + 7) >> 3;
+	const int ytiles = (transposed_height + 7) >> 3;
+	const int wgx = 16;
+	const int wgy = 8;
+	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
+	const int dst_stride = 128; /* TODO chip constant */
+	const int xalign = compbits_per_byte * wgx;
+	const int yalign = wgy;
+	const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte;
+	const int ytilesaligned = roundup(ytiles, yalign);
+	const int gridw = roundup(tilepitch, wgx) / wgx;
+	const int gridh = roundup(ytilesaligned, wgy) / wgy;
+
+	if (xtiles > 4096 / 8 || ytiles > 4096 / 8) {
+		gk20a_warn(&g->dev->dev, "cde: too large surface");
+		return -EINVAL;
+	}
+
+	gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx",
+		  width, height, block_height_log2, compbits_offset);
+	gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)",
+		  width, height, xtiles, ytiles, tilepitch, ytilesaligned);
+	gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
+		  wgx, wgy, gridw, gridh);
+
+	if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) {
+		gk20a_warn(&g->dev->dev,
+			   "grid size (%d, %d) is not a multiple of work group size (%d, %d)",
+			   tilepitch, ytilesaligned, wgx, wgy);
+		return -EINVAL;
+	}
+
+	/* Write parameters */
+#define WRITE_PATCH(NAME, VALUE) \
+	params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
+	WRITE_PATCH(PATCH_USER_CONST_XTILES, xtiles);
+	WRITE_PATCH(PATCH_USER_CONST_YTILES, ytiles);
+	WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2, block_height_log2);
+	WRITE_PATCH(PATCH_USER_CONST_DSTPITCH, dst_stride);
+	WRITE_PATCH(PATCH_USER_CONST_DSTOFFSET, transpose ? 4 : 0); /* flag */
+	WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_X, gridw);
+	WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Y, gridh);
+	WRITE_PATCH(PATCH_VPC_CURRENT_GRID_SIZE_Z, 1);
+	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
+	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
+	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);
+	WRITE_PATCH(PATCH_QMD_CTA_RASTER_WIDTH, gridw);
+	WRITE_PATCH(PATCH_QMD_CTA_RASTER_HEIGHT, gridh);
+	WRITE_PATCH(PATCH_QMD_CTA_RASTER_DEPTH, 1);
+	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
+	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
+	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION2, 1);
+#undef WRITE_PATCH
+
+	gk20a_busy(g->dev);
+	err = gk20a_init_cde_support(g);
+	if (err)
+		goto out;
+	err = gk20a_cde_convert(g, dmabuf, dmabuf,
+				0, /* dst kind */
+				compbits_offset,
+				0, /* dst_size, 0 = auto */
+				fence_in, submit_flags,
+				params, param, fence_out);
+out:
+	gk20a_idle(g->dev);
+	return err;
+}
+
+int gk20a_prepare_compressible_read(
+		struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
+		u64 compbits_hoffset, u64 compbits_voffset,
+		u32 width, u32 height, u32 block_height_log2,
+		u32 submit_flags, struct nvhost_fence *fence,
+		u32 *valid_compbits, struct gk20a_fence **fence_out)
+{
+	int err = 0;
+	struct gk20a_buffer_state *state;
+	struct dma_buf *dmabuf;
+	u32 missing_bits;
+
+	if (!g->cde_app.initialised) {
+		err = gk20a_cde_reload(g);
+		if (err)
+			return err;
+	}
+
+	dmabuf = dma_buf_get(buffer_fd);
+	if (IS_ERR(dmabuf))
+		return -EINVAL;
+
+	err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state);
+	if (err) {
+		dma_buf_put(dmabuf);
+		return err;
+	}
+
+	mutex_lock(&state->lock);
+
+	/* read valid_compbits only under the state lock */
+	missing_bits = (state->valid_compbits ^ request) & request;
+
+	if (state->valid_compbits && request == NVHOST_GPU_COMPBITS_NONE) {
+		gk20a_fence_put(state->fence);
+		state->fence = NULL;
+		/* state->fence = decompress();
+		   state->valid_compbits = 0; */
+		err = -EINVAL;
+		goto out;
+	} else if (missing_bits) {
+		struct gk20a_fence *new_fence = NULL;
+		if ((state->valid_compbits & NVHOST_GPU_COMPBITS_GPU) &&
+		    (missing_bits & NVHOST_GPU_COMPBITS_CDEH)) {
+			err = gk20a_buffer_convert_gpu_to_cde(
+					g, dmabuf,
+					NVHOST_GPU_COMPBITS_CDEH,
+					offset, compbits_hoffset,
+					width, height, block_height_log2,
+					submit_flags, fence,
+					&new_fence);
+			if (err)
+				goto out;
+
+			/* CDEH bits generated, update state & fence */
+			gk20a_fence_put(state->fence);
+			state->fence = new_fence;
+			state->valid_compbits |= NVHOST_GPU_COMPBITS_CDEH;
+		}
+		if ((state->valid_compbits & NVHOST_GPU_COMPBITS_GPU) &&
+		    (missing_bits & NVHOST_GPU_COMPBITS_CDEV)) {
+			err = gk20a_buffer_convert_gpu_to_cde(
+					g, dmabuf,
+					NVHOST_GPU_COMPBITS_CDEV,
+					offset, compbits_voffset,
+					width, height, block_height_log2,
+					submit_flags, fence,
+					&new_fence);
+			if (err)
+				goto out;
+
+			/* CDEV bits generated, update state & fence */
+			gk20a_fence_put(state->fence);
+			state->fence = new_fence;
+			state->valid_compbits |= NVHOST_GPU_COMPBITS_CDEV;
+		}
+	}
+
+	if (state->fence && fence_out)
+		*fence_out = gk20a_fence_get(state->fence);
+
+	if (valid_compbits)
+		*valid_compbits = state->valid_compbits;
+
+out:
+	mutex_unlock(&state->lock);
+	dma_buf_put(dmabuf);
+	return err;
+}
+
+int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
+				  u32 valid_compbits, u64 offset)
+{
+	int err;
+	struct gk20a_buffer_state *state;
+	struct dma_buf *dmabuf;
+
+	dmabuf = dma_buf_get(buffer_fd);
+	if (IS_ERR(dmabuf)) {
+		dev_err(dev_from_gk20a(g), "invalid dmabuf");
+		return -EINVAL;
+	}
+
+	err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state);
+	if (err) {
+		dev_err(dev_from_gk20a(g), "could not get state from dmabuf");
+		dma_buf_put(dmabuf);
+		return err;
+	}
+
+	mutex_lock(&state->lock);
+
+	/* Update the compbits state. */
+	state->valid_compbits = valid_compbits;
+
+	/* Discard previous compbit job fence. */
+	gk20a_fence_put(state->fence);
+	state->fence = NULL;
+
+	mutex_unlock(&state->lock);
+	dma_buf_put(dmabuf);
+	return 0;
+}
+
+static ssize_t gk20a_cde_reload_write(struct file *file,
+		const char __user *userbuf, size_t count, loff_t *ppos)
+{
+	struct gk20a *g = file->private_data;
+	gk20a_cde_reload(g);
+	return count;
+}
+
+static const struct file_operations gk20a_cde_reload_fops = {
+	.open = simple_open,
+	.write = gk20a_cde_reload_write,
+};
+
+void gk20a_cde_debugfs_init(struct platform_device *dev)
+{
+	struct gk20a_platform *platform = platform_get_drvdata(dev);
+	struct gk20a *g = get_gk20a(dev);
+
+	debugfs_create_file("reload_cde_firmware", S_IWUSR, platform->debugfs,
+			    g, &gk20a_cde_reload_fops);
+}
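
Worked example of the launch math in gk20a_buffer_convert_gpu_to_cde(), with
assumed inputs (a 1920x1080 surface, horizontal pass, so no transpose):
xtiles = (1920 + 7) >> 3 = 240 and ytiles = (1080 + 7) >> 3 = 135 roptiles;
with wgx = 16, wgy = 8 and xalign = 4 * 16 = 64, tilepitch =
roundup(240, 64) / 4 = 256 / 4 = 64 and ytilesaligned = roundup(135, 8) = 136,
giving gridw = 64 / 16 = 4 and gridh = 136 / 8 = 17. Both divisibility checks
pass (64 % 16 == 0 and 136 % 8 == 0), so the job launches a 4x17 grid of 16x8
work groups.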
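
For context, a minimal sketch of how the two new entry points are intended to
be driven around a GPU job; the caller, buffer fd, sizes, and offsets below
are illustrative, while the NVHOST_GPU_COMPBITS_* flags and gk20a_* calls come
from this patch:

	static int example_compbits_read(struct gk20a *g, u32 fd)
	{
		struct gk20a_fence *fence_out = NULL;
		u32 valid = 0;
		int err;

		/* Request horizontally-swizzled compbits; a CDE job is
		 * launched only for variants still missing from the state. */
		err = gk20a_prepare_compressible_read(g, fd,
				NVHOST_GPU_COMPBITS_CDEH,
				0 /* offset */,
				0x10000 /* compbits_hoffset, illustrative */,
				0 /* compbits_voffset */,
				1920, 1080, 4 /* block_height_log2 */,
				0 /* submit_flags */, NULL /* fence in */,
				&valid, &fence_out);
		if (err)
			return err;

		/* ...wait on fence_out, then consume the buffer... */
		if (fence_out)
			gk20a_fence_put(fence_out);

		/* Once the GPU rewrites the surface, only the GPU-format
		 * bits are valid again; CDE variants must be regenerated. */
		return gk20a_mark_compressible_write(g, fd,
				NVHOST_GPU_COMPBITS_GPU, 0 /* offset */);
	}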