diff options
author | Jussi Rasanen <jrasanen@nvidia.com> | 2014-09-22 05:05:23 -0400 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-03-18 15:11:28 -0400 |
commit | 47298dae35d37effed55c2b46e420186114ff83c (patch) | |
tree | 6c1fef3c6347d8a8a65b27ba20fa3fa940f99c9f /drivers | |
parent | ad39ba2b9e514f99dca588bad799ce63ac8022be (diff) |
gpu: nvgpu: cde: CDE swizzling optimizations
Change CDE swizzling shader kernel size to 8x8 to avoid waste with
relatively small surfaces.
Map compbit backing store and destination surface as cacheable.
Clean up kernel size calculation.
Bug 1546619
Change-Id: Ie97c019b4137d2f2230da6ba3034387b1ab1468a
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/501158
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 26 |
1 file changed, 10 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index ad2ee159..3644c2ef 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c | |||
@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
637 | /* map the destination buffer */ | 637 | /* map the destination buffer */ |
638 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ | 638 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ |
639 | dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, | 639 | dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, |
640 | 0, dst_kind, NULL, true, | 640 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
641 | dst_kind, NULL, true, | ||
641 | gk20a_mem_flag_none, | 642 | gk20a_mem_flag_none, |
642 | 0, 0); | 643 | 0, 0); |
643 | if (!dst_vaddr) { | 644 | if (!dst_vaddr) { |
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
654 | /* map the source buffer to prevent premature release */ | 655 | /* map the source buffer to prevent premature release */ |
655 | get_dma_buf(src); /* a ref for gk20a_vm_map */ | 656 | get_dma_buf(src); /* a ref for gk20a_vm_map */ |
656 | src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, | 657 | src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, |
657 | 0, dst_kind, NULL, true, | 658 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
659 | dst_kind, NULL, true, | ||
658 | gk20a_mem_flag_none, | 660 | gk20a_mem_flag_none, |
659 | 0, 0); | 661 | 0, 0); |
660 | if (!src_vaddr) { | 662 | if (!src_vaddr) { |
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) | |||
794 | 796 | ||
795 | /* map backing store to gpu virtual space */ | 797 | /* map backing store to gpu virtual space */ |
796 | vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, | 798 | vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, |
797 | g->gr.compbit_store.size, 0, | 799 | g->gr.compbit_store.size, |
800 | NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
798 | gk20a_mem_flag_read_only); | 801 | gk20a_mem_flag_read_only); |
799 | 802 | ||
800 | if (!vaddr) { | 803 | if (!vaddr) { |
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde( | |||
991 | const int transposed_height = transpose ? width : height; | 994 | const int transposed_height = transpose ? width : height; |
992 | const int xtiles = (transposed_width + 7) >> 3; | 995 | const int xtiles = (transposed_width + 7) >> 3; |
993 | const int ytiles = (transposed_height + 7) >> 3; | 996 | const int ytiles = (transposed_height + 7) >> 3; |
994 | const int wgx = 16; | 997 | const int wgx = 8; |
995 | const int wgy = 8; | 998 | const int wgy = 8; |
996 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ | 999 | const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */ |
997 | const int dst_stride = 128; /* TODO chip constant */ | 1000 | const int dst_stride = 128; /* TODO chip constant */ |
998 | const int xalign = compbits_per_byte * wgx; | 1001 | const int xalign = compbits_per_byte * wgx; |
999 | const int yalign = wgy; | 1002 | const int yalign = wgy; |
1000 | const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte; | 1003 | const int gridw = roundup(xtiles, xalign) / xalign; |
1001 | const int ytilesaligned = roundup(ytiles, yalign); | 1004 | const int gridh = roundup(ytiles, yalign) / yalign; |
1002 | const int gridw = roundup(tilepitch, wgx) / wgx; | ||
1003 | const int gridh = roundup(ytilesaligned, wgy) / wgy; | ||
1004 | 1005 | ||
1005 | if (!g->cde_app.initialised) { | 1006 | if (!g->cde_app.initialised) { |
1006 | err = gk20a_cde_reload(g); | 1007 | err = gk20a_cde_reload(g); |
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde( | |||
1015 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", | 1016 | gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx", |
1016 | width, height, block_height_log2, compbits_offset); | 1017 | width, height, block_height_log2, compbits_offset); |
1017 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", | 1018 | gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)", |
1018 | width, height, xtiles, ytiles, tilepitch, ytilesaligned); | 1019 | width, height, xtiles, ytiles, gridw*wgx, gridh*wgy); |
1019 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", | 1020 | gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)", |
1020 | wgx, wgy, gridw, gridh); | 1021 | wgx, wgy, gridw, gridh); |
1021 | 1022 | ||
1022 | if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) { | ||
1023 | gk20a_warn(&g->dev->dev, | ||
1024 | "grid size (%d, %d) is not a multiple of work group size (%d, %d)", | ||
1025 | tilepitch, ytilesaligned, wgx, wgy); | ||
1026 | return -EINVAL; | ||
1027 | } | ||
1028 | |||
1029 | /* Write parameters */ | 1023 | /* Write parameters */ |
1030 | #define WRITE_PATCH(NAME, VALUE) \ | 1024 | #define WRITE_PATCH(NAME, VALUE) \ |
1031 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} | 1025 | params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE} |