From 47298dae35d37effed55c2b46e420186114ff83c Mon Sep 17 00:00:00 2001
From: Jussi Rasanen <jrasanen@nvidia.com>
Date: Mon, 22 Sep 2014 12:05:23 +0300
Subject: gpu: nvgpu: cde: CDE swizzling optimizations

Change CDE swizzling shader kernel size to 8x8 to avoid waste with
relatively small surfaces.

Map compbit backing store and destination surface as cacheable.

Clean up kernel size calculation.

Bug 1546619

Change-Id: Ie97c019b4137d2f2230da6ba3034387b1ab1468a
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/501158
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ad2ee159..3644c2ef 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the destination buffer */
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
 	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!dst_vaddr) {
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the source buffer to prevent premature release */
 	get_dma_buf(src); /* a ref for gk20a_vm_map */
 	src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!src_vaddr) {
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 
 	/* map backing store to gpu virtual space */
 	vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt,
-			       g->gr.compbit_store.size, 0,
+			       g->gr.compbit_store.size,
+			       NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 			       gk20a_mem_flag_read_only);
 
 	if (!vaddr) {
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	const int transposed_height = transpose ? width : height;
 	const int xtiles = (transposed_width + 7) >> 3;
 	const int ytiles = (transposed_height + 7) >> 3;
-	const int wgx = 16;
+	const int wgx = 8;
 	const int wgy = 8;
 	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
 	const int dst_stride = 128; /* TODO chip constant */
 	const int xalign = compbits_per_byte * wgx;
 	const int yalign = wgy;
-	const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte;
-	const int ytilesaligned = roundup(ytiles, yalign);
-	const int gridw = roundup(tilepitch, wgx) / wgx;
-	const int gridh = roundup(ytilesaligned, wgy) / wgy;
+	const int gridw = roundup(xtiles, xalign) / xalign;
+	const int gridh = roundup(ytiles, yalign) / yalign;
 
 	if (!g->cde_app.initialised) {
 		err = gk20a_cde_reload(g);
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx",
 		  width, height, block_height_log2, compbits_offset);
 	gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)",
-		  width, height, xtiles, ytiles, tilepitch, ytilesaligned);
+		  width, height, xtiles, ytiles, gridw*wgx, gridh*wgy);
 	gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
 		  wgx, wgy, gridw, gridh);
 
-	if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) {
-		gk20a_warn(&g->dev->dev,
-			"grid size (%d, %d) is not a multiple of work group size (%d, %d)",
-			tilepitch, ytilesaligned, wgx, wgy);
-		return -EINVAL;
-	}
-
 	/* Write parameters */
 #define WRITE_PATCH(NAME, VALUE) \
 		params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
-- 
cgit v1.2.2