diff options
author | Jussi Rasanen <jrasanen@nvidia.com> | 2014-09-22 05:05:23 -0400 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-03-18 15:11:28 -0400 |
commit | 47298dae35d37effed55c2b46e420186114ff83c (patch) | |
tree | 6c1fef3c6347d8a8a65b27ba20fa3fa940f99c9f /drivers | |
parent | ad39ba2b9e514f99dca588bad799ce63ac8022be (diff) |
gpu: nvgpu: cde: CDE swizzling optimizations
Change CDE swizzling shader kernel size to 8x8 to avoid waste with
relatively small surfaces.
Map compbit backing store and destination surface as cacheable.
Clean up kernel size calculation.
Bug 1546619
Change-Id: Ie97c019b4137d2f2230da6ba3034387b1ab1468a
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/501158
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 26 |
1 file changed, 10 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index ad2ee159..3644c2ef 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c | |||
@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
637 | /* map the destination buffer */ | 637 | /* map the destination buffer */ |
638 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ | 638 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ |
639 | dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, | 639 | dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, |
640 | 0, dst_kind, NULL, true, | 640 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
641 | dst_kind, NULL, true, | ||
641 | gk20a_mem_flag_none, | 642 | gk20a_mem_flag_none, |
642 | 0, 0); | 643 | 0, 0); |
643 | if (!dst_vaddr) { | 644 | if (!dst_vaddr) { |
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
654 | /* map the source buffer to prevent premature release */ | 655 | /* map the source buffer to prevent premature release */ |
655 | get_dma_buf(src); /* a ref for gk20a_vm_map */ | 656 | get_dma_buf(src); /* a ref for gk20a_vm_map */ |
656 | src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, | 657 | src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, |
657 | 0, dst_kind, NULL, true, | 658 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
659 | dst_kind, NULL, true, | ||
658 | gk20a_mem_flag_none, | 660 | gk20a_mem_flag_none, |
659 | 0, 0); | 661 | 0, 0); |
660 | if (!src_vaddr) { | 662 | if (!src_vaddr) { |
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) | |||
794 | 796 | ||
795 | /* map backing store to gpu virtual space */ | 797 | /* map backing store to gpu virtual space */ |
796 | vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, | 798 | vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, |
797 | g->gr.compbit_store.size, 0, | 799 | g->gr.compbit_store.size, |
800 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
798 | gk20a_mem_flag_read_only); | 801 | gk20a_mem_flag_read_only); |
799 | 802 | ||
800 | if (!vaddr) { | 803 | if (!vaddr) { |
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde( | |||
991 | const int transposed_height = transpose ? width : height; | 994 | const int transposed_height = transpose ? width : height; |
992 | const int xtiles = (transposed_width + 7) >> 3; | 995 | const int xtiles = (transposed_width + 7) >> 3; |
993 | const int ytiles = (transposed_height + 7) >> 3; | 996 | const int ytiles = (transposed_height + 7) >> 3; |
994 | const int wgx = 16; | 997 | const int wgx = 8; |
995 | const int wgy = 8; | 998 | const int wgy = 8; |
996 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | 999 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ |
997 | const int dst_stride = 128; /* TODO chip constant */ | 1000 | const int dst_stride = 128; /* TODO chip constant */ |
998 | const int xalign = compbits_per_byte * wgx; | 1001 | const int xalign = compbits_per_byte * wgx; |
999 | const int yalign = wgy; | 1002 | const int yalign = wgy; |
1000 | const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte; | 1003 | const int gridw = roundup(xtiles, xalign) / xalign; |
1001 | const int ytilesaligned = roundup(ytiles, yalign); | 1004 | const int gridh = roundup(ytiles, yalign) / yalign; |
1002 | const int gridw = roundup(tilepitch, wgx) / wgx; | ||
1003 | const int gridh = roundup(ytilesaligned, wgy) / wgy; | ||
1004 | 1005 | ||
1005 | if (!g->cde_app.initialised) { | 1006 | if (!g->cde_app.initialised) { |
1006 | err = gk20a_cde_reload(g); | 1007 | err = gk20a_cde_reload(g); |
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde( | |||
1015 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", | 1016 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", |
1016 | width, height, block_height_log2, compbits_offset); | 1017 | width, height, block_height_log2, compbits_offset); |
1017 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", | 1018 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", |
1018 | width, height, xtiles, ytiles, tilepitch, ytilesaligned); | 1019 | width, height, xtiles, ytiles, gridw*wgx, gridh*wgy); |
1019 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | 1020 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", |
1020 | wgx, wgy, gridw, gridh); | 1021 | wgx, wgy, gridw, gridh); |
1021 | 1022 | ||
1022 | if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) { | ||
1023 | gk20a_warn(&g->dev->dev, | ||
1024 | "grid size (%d, %d) is not a multiple of work group size (%d, %d)", | ||
1025 | tilepitch, ytilesaligned, wgx, wgy); | ||
1026 | return -EINVAL; | ||
1027 | } | ||
1028 | |||
1029 | /* Write parameters */ | 1023 | /* Write parameters */ |
1030 | #define WRITE_PATCH(NAME, VALUE) \ | 1024 | #define WRITE_PATCH(NAME, VALUE) \ |
1031 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | 1025 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} |