summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorJussi Rasanen <jrasanen@nvidia.com>2014-09-22 05:05:23 -0400
committerDan Willemsen <dwillemsen@nvidia.com>2015-03-18 15:11:28 -0400
commit47298dae35d37effed55c2b46e420186114ff83c (patch)
tree6c1fef3c6347d8a8a65b27ba20fa3fa940f99c9f /drivers
parentad39ba2b9e514f99dca588bad799ce63ac8022be (diff)
gpu: nvgpu: cde: CDE swizzling optimizations
Change CDE swizzling shader kernel size to 8x8 to avoid waste with relatively small surfaces. Map compbit backing store and destination surface as cacheable. Clean up kernel size calculation.

Bug 1546619

Change-Id: Ie97c019b4137d2f2230da6ba3034387b1ab1468a
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/501158
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/nvgpu/gk20a/cde_gk20a.c26
1 file changed, 10 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ad2ee159..3644c2ef 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
637 /* map the destination buffer */ 637 /* map the destination buffer */
638 get_dma_buf(dst); /* a ref for gk20a_vm_map */ 638 get_dma_buf(dst); /* a ref for gk20a_vm_map */
639 dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, 639 dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
640 0, dst_kind, NULL, true, 640 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
641 dst_kind, NULL, true,
641 gk20a_mem_flag_none, 642 gk20a_mem_flag_none,
642 0, 0); 643 0, 0);
643 if (!dst_vaddr) { 644 if (!dst_vaddr) {
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
654 /* map the source buffer to prevent premature release */ 655 /* map the source buffer to prevent premature release */
655 get_dma_buf(src); /* a ref for gk20a_vm_map */ 656 get_dma_buf(src); /* a ref for gk20a_vm_map */
656 src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, 657 src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
657 0, dst_kind, NULL, true, 658 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
659 dst_kind, NULL, true,
658 gk20a_mem_flag_none, 660 gk20a_mem_flag_none,
659 0, 0); 661 0, 0);
660 if (!src_vaddr) { 662 if (!src_vaddr) {
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
794 796
795 /* map backing store to gpu virtual space */ 797 /* map backing store to gpu virtual space */
796 vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, 798 vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt,
797 g->gr.compbit_store.size, 0, 799 g->gr.compbit_store.size,
800 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
798 gk20a_mem_flag_read_only); 801 gk20a_mem_flag_read_only);
799 802
800 if (!vaddr) { 803 if (!vaddr) {
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde(
991 const int transposed_height = transpose ? width : height; 994 const int transposed_height = transpose ? width : height;
992 const int xtiles = (transposed_width + 7) >> 3; 995 const int xtiles = (transposed_width + 7) >> 3;
993 const int ytiles = (transposed_height + 7) >> 3; 996 const int ytiles = (transposed_height + 7) >> 3;
994 const int wgx = 16; 997 const int wgx = 8;
995 const int wgy = 8; 998 const int wgy = 8;
996 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ 999 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
997 const int dst_stride = 128; /* TODO chip constant */ 1000 const int dst_stride = 128; /* TODO chip constant */
998 const int xalign = compbits_per_byte * wgx; 1001 const int xalign = compbits_per_byte * wgx;
999 const int yalign = wgy; 1002 const int yalign = wgy;
1000 const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte; 1003 const int gridw = roundup(xtiles, xalign) / xalign;
1001 const int ytilesaligned = roundup(ytiles, yalign); 1004 const int gridh = roundup(ytiles, yalign) / yalign;
1002 const int gridw = roundup(tilepitch, wgx) / wgx;
1003 const int gridh = roundup(ytilesaligned, wgy) / wgy;
1004 1005
1005 if (!g->cde_app.initialised) { 1006 if (!g->cde_app.initialised) {
1006 err = gk20a_cde_reload(g); 1007 err = gk20a_cde_reload(g);
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde(
1015 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", 1016 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx",
1016 width, height, block_height_log2, compbits_offset); 1017 width, height, block_height_log2, compbits_offset);
1017 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", 1018 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)",
1018 width, height, xtiles, ytiles, tilepitch, ytilesaligned); 1019 width, height, xtiles, ytiles, gridw*wgx, gridh*wgy);
1019 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", 1020 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
1020 wgx, wgy, gridw, gridh); 1021 wgx, wgy, gridw, gridh);
1021 1022
1022 if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) {
1023 gk20a_warn(&g->dev->dev,
1024 "grid size (%d, %d) is not a multiple of work group size (%d, %d)",
1025 tilepitch, ytilesaligned, wgx, wgy);
1026 return -EINVAL;
1027 }
1028
1029 /* Write parameters */ 1023 /* Write parameters */
1030#define WRITE_PATCH(NAME, VALUE) \ 1024#define WRITE_PATCH(NAME, VALUE) \
1031 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} 1025 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}