summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jussi Rasanen <jrasanen@nvidia.com> 2014-10-02 12:12:51 -0400
committer: Dan Willemsen <dwillemsen@nvidia.com> 2015-03-18 15:11:42 -0400
commit835be94af5ef74b6de972b2dea69c34a90b11d1b (patch)
treefe7f57fcacadd8805424ae5a693b85198e1319eb
parentcb5436867041930cd1f641da123a73b60a3da29b (diff)
gpu: nvgpu: cde: CDE optimizations
- Change cde_buf to use a writecombined CPU mapping.
- Since reading writecombined CPU data is still slow, avoid reads in
  gk20a_replace_data by checking whether a patch overwrites a whole word.
- Remove the unused distinction between src and dst buffers in cde_convert.
- Remove the CDE debug dump code, as it causes a perf hit.

Bug 1546619

Change-Id: Ibd45d9c3a3dd3936184c2a2a0ba29e919569b328
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/553233
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 74
-rw-r--r-- drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 2
2 files changed, 22 insertions, 54 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 4c33ea8d..b7264206 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -34,23 +34,6 @@
34#include "hw_ccsr_gk20a.h" 34#include "hw_ccsr_gk20a.h"
35#include "hw_pbdma_gk20a.h" 35#include "hw_pbdma_gk20a.h"
36 36
37void gk20a_cde_dump(struct gk20a_cde_ctx *cde_ctx)
38{
39 int i;
40 for (i = 0; i < cde_ctx->num_bufs; i++) {
41 struct gk20a_cde_mem_desc *target_mem = cde_ctx->mem + i;
42 u32 *target_mem_ptr = target_mem->cpuva;
43 int j = 0;
44
45 gk20a_dbg(gpu_dbg_cde, "cde: buffer=%d, size=%zu, gpuva=%llx\n",
46 i, target_mem->num_bytes, target_mem->gpu_va);
47
48 for (j = 0; j < target_mem->num_bytes / sizeof(u32); j++)
49 gk20a_dbg(gpu_dbg_cde, "0x%08x ", target_mem_ptr[j]);
50 gk20a_dbg(gpu_dbg_cde, "\n\n");
51 }
52}
53
54static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) 37static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
55{ 38{
56 struct device *dev = &cde_ctx->pdev->dev; 39 struct device *dev = &cde_ctx->pdev->dev;
@@ -60,7 +43,8 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
60 struct gk20a_cde_mem_desc *mem = cde_ctx->mem + i; 43 struct gk20a_cde_mem_desc *mem = cde_ctx->mem + i;
61 gk20a_gmmu_unmap(cde_ctx->vm, mem->gpu_va, mem->num_bytes, 1); 44 gk20a_gmmu_unmap(cde_ctx->vm, mem->gpu_va, mem->num_bytes, 1);
62 gk20a_free_sgtable(&mem->sgt); 45 gk20a_free_sgtable(&mem->sgt);
63 dma_free_coherent(dev, mem->num_bytes, mem->cpuva, mem->iova); 46 dma_free_writecombine(dev, mem->num_bytes, mem->cpuva,
47 mem->iova);
64 } 48 }
65 49
66 for (i = 0; i < cde_ctx->num_obj_ids; i++) 50 for (i = 0; i < cde_ctx->num_obj_ids; i++)
@@ -140,7 +124,7 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
140 /* allocate buf */ 124 /* allocate buf */
141 mem = cde_ctx->mem + cde_ctx->num_bufs; 125 mem = cde_ctx->mem + cde_ctx->num_bufs;
142 mem->num_bytes = buf->num_bytes; 126 mem->num_bytes = buf->num_bytes;
143 mem->cpuva = dma_alloc_coherent(dev, mem->num_bytes, &mem->iova, 127 mem->cpuva = dma_alloc_writecombine(dev, mem->num_bytes, &mem->iova,
144 GFP_KERNEL); 128 GFP_KERNEL);
145 if (!mem->cpuva) { 129 if (!mem->cpuva) {
146 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. buffer idx = %d", 130 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. buffer idx = %d",
@@ -157,8 +141,9 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
157 goto err_get_sgtable; 141 goto err_get_sgtable;
158 } 142 }
159 143
160 mem->gpu_va = gk20a_gmmu_map(cde_ctx->vm, &mem->sgt, mem->num_bytes, 0, 144 mem->gpu_va = gk20a_gmmu_map(cde_ctx->vm, &mem->sgt, mem->num_bytes,
161 gk20a_mem_flag_none); 145 0,
146 gk20a_mem_flag_none);
162 if (!mem->gpu_va) { 147 if (!mem->gpu_va) {
163 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not map buffer to gpuva. buffer idx = %d", 148 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not map buffer to gpuva. buffer idx = %d",
164 cde_ctx->num_bufs); 149 cde_ctx->num_bufs);
@@ -179,7 +164,7 @@ err_map_buffer:
179 gk20a_free_sgtable(&mem->sgt); 164 gk20a_free_sgtable(&mem->sgt);
180 kfree(mem->sgt); 165 kfree(mem->sgt);
181err_get_sgtable: 166err_get_sgtable:
182 dma_free_coherent(dev, mem->num_bytes, &mem->cpuva, mem->iova); 167 dma_free_writecombine(dev, mem->num_bytes, &mem->cpuva, mem->iova);
183 return err; 168 return err;
184} 169}
185 170
@@ -194,11 +179,14 @@ static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target,
194 value &= mask; 179 value &= mask;
195 180
196 /* read current data from the location */ 181 /* read current data from the location */
197 if (type == TYPE_PARAM_TYPE_U32) 182 current_value = 0;
198 current_value = *target_mem_ptr; 183 if (type == TYPE_PARAM_TYPE_U32) {
199 else if (type == TYPE_PARAM_TYPE_U64_LITTLE) 184 if (mask != 0xfffffffful)
200 current_value = *target_mem_ptr_u64; 185 current_value = *target_mem_ptr;
201 else if (type == TYPE_PARAM_TYPE_U64_BIG) { 186 } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) {
187 if (mask != ~0ul)
188 current_value = *target_mem_ptr_u64;
189 } else if (type == TYPE_PARAM_TYPE_U64_BIG) {
202 current_value = *target_mem_ptr_u64; 190 current_value = *target_mem_ptr_u64;
203 current_value = (u64)(current_value >> 32) | 191 current_value = (u64)(current_value >> 32) |
204 (u64)(current_value << 32); 192 (u64)(current_value << 32);
@@ -601,7 +589,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
601 num_entries, flags, fence, fence_out); 589 num_entries, flags, fence, fence_out);
602} 590}
603 591
604int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, 592int gk20a_cde_convert(struct gk20a *g,
605 struct dma_buf *dst, 593 struct dma_buf *dst,
606 s32 dst_kind, u64 dst_byte_offset, 594 s32 dst_kind, u64 dst_byte_offset,
607 u32 dst_size, struct nvgpu_fence *fence, 595 u32 dst_size, struct nvgpu_fence *fence,
@@ -611,7 +599,7 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
611 struct gk20a_cde_app *cde_app = &g->cde_app; 599 struct gk20a_cde_app *cde_app = &g->cde_app;
612 struct gk20a_comptags comptags; 600 struct gk20a_comptags comptags;
613 struct gk20a_cde_ctx *cde_ctx; 601 struct gk20a_cde_ctx *cde_ctx;
614 u64 dst_vaddr = 0, src_vaddr = 0; 602 u64 dst_vaddr = 0;
615 u32 flags; 603 u32 flags;
616 int err, i; 604 int err, i;
617 605
@@ -647,30 +635,13 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
647 goto exit_unlock; 635 goto exit_unlock;
648 } 636 }
649 637
650 /* ensure that the src buffer has drvdata */
651 err = gk20a_dmabuf_alloc_drvdata(src, &g->dev->dev);
652 if (err)
653 goto exit_unlock;
654
655 /* map the source buffer to prevent premature release */
656 get_dma_buf(src); /* a ref for gk20a_vm_map */
657 src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
658 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
659 dst_kind, NULL, true,
660 gk20a_mem_flag_none,
661 0, 0);
662 if (!src_vaddr) {
663 dma_buf_put(src);
664 err = -EINVAL;
665 goto exit_unlock;
666 }
667
668 if (!dst_size) 638 if (!dst_size)
669 dst_size = dst->size - dst_byte_offset; 639 dst_size = dst->size - dst_byte_offset;
670 640
671 /* reload buffer converter if it has failed */ 641 /* reload buffer converter if it has failed */
672 if (cde_ctx->ch->has_timedout) { 642 if (cde_ctx->ch->has_timedout) {
673 mutex_unlock(&cde_app->mutex); 643 mutex_unlock(&cde_app->mutex);
644 gk20a_warn(&cde_ctx->pdev->dev, "cde: had timed out, reloading");
674 err = gk20a_cde_reload(g); 645 err = gk20a_cde_reload(g);
675 if (err) 646 if (err)
676 return err; 647 return err;
@@ -685,8 +656,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
685 } 656 }
686 657
687 /* store source buffer compression tags */ 658 /* store source buffer compression tags */
688 gk20a_get_comptags(&g->dev->dev, src, &comptags); 659 gk20a_get_comptags(&g->dev->dev, dst, &comptags);
689 cde_ctx->src_vaddr = src_vaddr; 660 cde_ctx->src_vaddr = dst_vaddr;
690 cde_ctx->src_param_offset = comptags.offset; 661 cde_ctx->src_param_offset = comptags.offset;
691 cde_ctx->src_param_lines = comptags.lines; 662 cde_ctx->src_param_lines = comptags.lines;
692 663
@@ -722,7 +693,6 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
722 g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); 693 g->gr.compbit_store.size, cde_ctx->backing_store_vaddr);
723 gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", 694 gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n",
724 cde_ctx->dest_size, cde_ctx->dest_vaddr); 695 cde_ctx->dest_size, cde_ctx->dest_vaddr);
725 gk20a_cde_dump(cde_ctx);
726 696
727 /* execute the init push buffer */ 697 /* execute the init push buffer */
728 if (!cde_ctx->init_cmd_executed) { 698 if (!cde_ctx->init_cmd_executed) {
@@ -747,8 +717,6 @@ exit_unlock:
747 /* unmap the buffers - channel holds references to them now */ 717 /* unmap the buffers - channel holds references to them now */
748 if (dst_vaddr) 718 if (dst_vaddr)
749 gk20a_vm_unmap(g->cde_app.vm, dst_vaddr); 719 gk20a_vm_unmap(g->cde_app.vm, dst_vaddr);
750 if (src_vaddr)
751 gk20a_vm_unmap(g->cde_app.vm, src_vaddr);
752 720
753 mutex_unlock(&cde_app->mutex); 721 mutex_unlock(&cde_app->mutex);
754 722
@@ -1054,7 +1022,7 @@ static int gk20a_buffer_convert_gpu_to_cde(
1054 err = gk20a_init_cde_support(g); 1022 err = gk20a_init_cde_support(g);
1055 if (err) 1023 if (err)
1056 goto out; 1024 goto out;
1057 err = gk20a_cde_convert(g, dmabuf, dmabuf, 1025 err = gk20a_cde_convert(g, dmabuf,
1058 0, /* dst kind */ 1026 0, /* dst kind */
1059 compbits_offset, 1027 compbits_offset,
1060 0, /* dst_size, 0 = auto */ 1028 0, /* dst_size, 0 = auto */
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index c427d4db..b93e56c8 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -258,7 +258,7 @@ struct gk20a_cde_app {
258int gk20a_cde_destroy(struct gk20a *g); 258int gk20a_cde_destroy(struct gk20a *g);
259int gk20a_init_cde_support(struct gk20a *g); 259int gk20a_init_cde_support(struct gk20a *g);
260int gk20a_cde_reload(struct gk20a *g); 260int gk20a_cde_reload(struct gk20a *g);
261int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, struct dma_buf *dst, 261int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst,
262 s32 dst_kind, u64 dst_word_offset, 262 s32 dst_kind, u64 dst_word_offset,
263 u32 dst_size, struct nvgpu_fence *fence, 263 u32 dst_size, struct nvgpu_fence *fence,
264 u32 __flags, struct gk20a_cde_param *params, 264 u32 __flags, struct gk20a_cde_param *params,