diff options
author | Jussi Rasanen <jrasanen@nvidia.com> | 2014-10-02 12:12:51 -0400 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-03-18 15:11:42 -0400 |
commit | 835be94af5ef74b6de972b2dea69c34a90b11d1b (patch) | |
tree | fe7f57fcacadd8805424ae5a693b85198e1319eb /drivers/gpu | |
parent | cb5436867041930cd1f641da123a73b60a3da29b (diff) |
gpu: nvgpu: cde: CDE optimizations
-Change cde_buf to use a write-combined CPU mapping.
-Since CPU reads through a write-combined mapping are still slow, avoid
reads in gk20a_replace_data by checking whether a patch overwrites a
whole word.
-Remove unused distinction between src and dst buffers in cde_convert.
-Remove the CDE debug dump code (gk20a_cde_dump), as it causes a performance hit.
Bug 1546619
Change-Id: Ibd45d9c3a3dd3936184c2a2a0ba29e919569b328
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/553233
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 74 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 2 |
2 files changed, 22 insertions, 54 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index 4c33ea8d..b7264206 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c | |||
@@ -34,23 +34,6 @@ | |||
34 | #include "hw_ccsr_gk20a.h" | 34 | #include "hw_ccsr_gk20a.h" |
35 | #include "hw_pbdma_gk20a.h" | 35 | #include "hw_pbdma_gk20a.h" |
36 | 36 | ||
37 | void gk20a_cde_dump(struct gk20a_cde_ctx *cde_ctx) | ||
38 | { | ||
39 | int i; | ||
40 | for (i = 0; i < cde_ctx->num_bufs; i++) { | ||
41 | struct gk20a_cde_mem_desc *target_mem = cde_ctx->mem + i; | ||
42 | u32 *target_mem_ptr = target_mem->cpuva; | ||
43 | int j = 0; | ||
44 | |||
45 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=%d, size=%zu, gpuva=%llx\n", | ||
46 | i, target_mem->num_bytes, target_mem->gpu_va); | ||
47 | |||
48 | for (j = 0; j < target_mem->num_bytes / sizeof(u32); j++) | ||
49 | gk20a_dbg(gpu_dbg_cde, "0x%08x ", target_mem_ptr[j]); | ||
50 | gk20a_dbg(gpu_dbg_cde, "\n\n"); | ||
51 | } | ||
52 | } | ||
53 | |||
54 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | 37 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) |
55 | { | 38 | { |
56 | struct device *dev = &cde_ctx->pdev->dev; | 39 | struct device *dev = &cde_ctx->pdev->dev; |
@@ -60,7 +43,8 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | |||
60 | struct gk20a_cde_mem_desc *mem = cde_ctx->mem + i; | 43 | struct gk20a_cde_mem_desc *mem = cde_ctx->mem + i; |
61 | gk20a_gmmu_unmap(cde_ctx->vm, mem->gpu_va, mem->num_bytes, 1); | 44 | gk20a_gmmu_unmap(cde_ctx->vm, mem->gpu_va, mem->num_bytes, 1); |
62 | gk20a_free_sgtable(&mem->sgt); | 45 | gk20a_free_sgtable(&mem->sgt); |
63 | dma_free_coherent(dev, mem->num_bytes, mem->cpuva, mem->iova); | 46 | dma_free_writecombine(dev, mem->num_bytes, mem->cpuva, |
47 | mem->iova); | ||
64 | } | 48 | } |
65 | 49 | ||
66 | for (i = 0; i < cde_ctx->num_obj_ids; i++) | 50 | for (i = 0; i < cde_ctx->num_obj_ids; i++) |
@@ -140,7 +124,7 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, | |||
140 | /* allocate buf */ | 124 | /* allocate buf */ |
141 | mem = cde_ctx->mem + cde_ctx->num_bufs; | 125 | mem = cde_ctx->mem + cde_ctx->num_bufs; |
142 | mem->num_bytes = buf->num_bytes; | 126 | mem->num_bytes = buf->num_bytes; |
143 | mem->cpuva = dma_alloc_coherent(dev, mem->num_bytes, &mem->iova, | 127 | mem->cpuva = dma_alloc_writecombine(dev, mem->num_bytes, &mem->iova, |
144 | GFP_KERNEL); | 128 | GFP_KERNEL); |
145 | if (!mem->cpuva) { | 129 | if (!mem->cpuva) { |
146 | gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. buffer idx = %d", | 130 | gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. buffer idx = %d", |
@@ -157,8 +141,9 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, | |||
157 | goto err_get_sgtable; | 141 | goto err_get_sgtable; |
158 | } | 142 | } |
159 | 143 | ||
160 | mem->gpu_va = gk20a_gmmu_map(cde_ctx->vm, &mem->sgt, mem->num_bytes, 0, | 144 | mem->gpu_va = gk20a_gmmu_map(cde_ctx->vm, &mem->sgt, mem->num_bytes, |
161 | gk20a_mem_flag_none); | 145 | 0, |
146 | gk20a_mem_flag_none); | ||
162 | if (!mem->gpu_va) { | 147 | if (!mem->gpu_va) { |
163 | gk20a_warn(&cde_ctx->pdev->dev, "cde: could not map buffer to gpuva. buffer idx = %d", | 148 | gk20a_warn(&cde_ctx->pdev->dev, "cde: could not map buffer to gpuva. buffer idx = %d", |
164 | cde_ctx->num_bufs); | 149 | cde_ctx->num_bufs); |
@@ -179,7 +164,7 @@ err_map_buffer: | |||
179 | gk20a_free_sgtable(&mem->sgt); | 164 | gk20a_free_sgtable(&mem->sgt); |
180 | kfree(mem->sgt); | 165 | kfree(mem->sgt); |
181 | err_get_sgtable: | 166 | err_get_sgtable: |
182 | dma_free_coherent(dev, mem->num_bytes, &mem->cpuva, mem->iova); | 167 | dma_free_writecombine(dev, mem->num_bytes, &mem->cpuva, mem->iova); |
183 | return err; | 168 | return err; |
184 | } | 169 | } |
185 | 170 | ||
@@ -194,11 +179,14 @@ static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, | |||
194 | value &= mask; | 179 | value &= mask; |
195 | 180 | ||
196 | /* read current data from the location */ | 181 | /* read current data from the location */ |
197 | if (type == TYPE_PARAM_TYPE_U32) | 182 | current_value = 0; |
198 | current_value = *target_mem_ptr; | 183 | if (type == TYPE_PARAM_TYPE_U32) { |
199 | else if (type == TYPE_PARAM_TYPE_U64_LITTLE) | 184 | if (mask != 0xfffffffful) |
200 | current_value = *target_mem_ptr_u64; | 185 | current_value = *target_mem_ptr; |
201 | else if (type == TYPE_PARAM_TYPE_U64_BIG) { | 186 | } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) { |
187 | if (mask != ~0ul) | ||
188 | current_value = *target_mem_ptr_u64; | ||
189 | } else if (type == TYPE_PARAM_TYPE_U64_BIG) { | ||
202 | current_value = *target_mem_ptr_u64; | 190 | current_value = *target_mem_ptr_u64; |
203 | current_value = (u64)(current_value >> 32) | | 191 | current_value = (u64)(current_value >> 32) | |
204 | (u64)(current_value << 32); | 192 | (u64)(current_value << 32); |
@@ -601,7 +589,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, | |||
601 | num_entries, flags, fence, fence_out); | 589 | num_entries, flags, fence, fence_out); |
602 | } | 590 | } |
603 | 591 | ||
604 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | 592 | int gk20a_cde_convert(struct gk20a *g, |
605 | struct dma_buf *dst, | 593 | struct dma_buf *dst, |
606 | s32 dst_kind, u64 dst_byte_offset, | 594 | s32 dst_kind, u64 dst_byte_offset, |
607 | u32 dst_size, struct nvgpu_fence *fence, | 595 | u32 dst_size, struct nvgpu_fence *fence, |
@@ -611,7 +599,7 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
611 | struct gk20a_cde_app *cde_app = &g->cde_app; | 599 | struct gk20a_cde_app *cde_app = &g->cde_app; |
612 | struct gk20a_comptags comptags; | 600 | struct gk20a_comptags comptags; |
613 | struct gk20a_cde_ctx *cde_ctx; | 601 | struct gk20a_cde_ctx *cde_ctx; |
614 | u64 dst_vaddr = 0, src_vaddr = 0; | 602 | u64 dst_vaddr = 0; |
615 | u32 flags; | 603 | u32 flags; |
616 | int err, i; | 604 | int err, i; |
617 | 605 | ||
@@ -647,30 +635,13 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
647 | goto exit_unlock; | 635 | goto exit_unlock; |
648 | } | 636 | } |
649 | 637 | ||
650 | /* ensure that the src buffer has drvdata */ | ||
651 | err = gk20a_dmabuf_alloc_drvdata(src, &g->dev->dev); | ||
652 | if (err) | ||
653 | goto exit_unlock; | ||
654 | |||
655 | /* map the source buffer to prevent premature release */ | ||
656 | get_dma_buf(src); /* a ref for gk20a_vm_map */ | ||
657 | src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, | ||
658 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
659 | dst_kind, NULL, true, | ||
660 | gk20a_mem_flag_none, | ||
661 | 0, 0); | ||
662 | if (!src_vaddr) { | ||
663 | dma_buf_put(src); | ||
664 | err = -EINVAL; | ||
665 | goto exit_unlock; | ||
666 | } | ||
667 | |||
668 | if (!dst_size) | 638 | if (!dst_size) |
669 | dst_size = dst->size - dst_byte_offset; | 639 | dst_size = dst->size - dst_byte_offset; |
670 | 640 | ||
671 | /* reload buffer converter if it has failed */ | 641 | /* reload buffer converter if it has failed */ |
672 | if (cde_ctx->ch->has_timedout) { | 642 | if (cde_ctx->ch->has_timedout) { |
673 | mutex_unlock(&cde_app->mutex); | 643 | mutex_unlock(&cde_app->mutex); |
644 | gk20a_warn(&cde_ctx->pdev->dev, "cde: had timed out, reloading"); | ||
674 | err = gk20a_cde_reload(g); | 645 | err = gk20a_cde_reload(g); |
675 | if (err) | 646 | if (err) |
676 | return err; | 647 | return err; |
@@ -685,8 +656,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
685 | } | 656 | } |
686 | 657 | ||
687 | /* store source buffer compression tags */ | 658 | /* store source buffer compression tags */ |
688 | gk20a_get_comptags(&g->dev->dev, src, &comptags); | 659 | gk20a_get_comptags(&g->dev->dev, dst, &comptags); |
689 | cde_ctx->src_vaddr = src_vaddr; | 660 | cde_ctx->src_vaddr = dst_vaddr; |
690 | cde_ctx->src_param_offset = comptags.offset; | 661 | cde_ctx->src_param_offset = comptags.offset; |
691 | cde_ctx->src_param_lines = comptags.lines; | 662 | cde_ctx->src_param_lines = comptags.lines; |
692 | 663 | ||
@@ -722,7 +693,6 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, | |||
722 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); | 693 | g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); |
723 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", | 694 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", |
724 | cde_ctx->dest_size, cde_ctx->dest_vaddr); | 695 | cde_ctx->dest_size, cde_ctx->dest_vaddr); |
725 | gk20a_cde_dump(cde_ctx); | ||
726 | 696 | ||
727 | /* execute the init push buffer */ | 697 | /* execute the init push buffer */ |
728 | if (!cde_ctx->init_cmd_executed) { | 698 | if (!cde_ctx->init_cmd_executed) { |
@@ -747,8 +717,6 @@ exit_unlock: | |||
747 | /* unmap the buffers - channel holds references to them now */ | 717 | /* unmap the buffers - channel holds references to them now */ |
748 | if (dst_vaddr) | 718 | if (dst_vaddr) |
749 | gk20a_vm_unmap(g->cde_app.vm, dst_vaddr); | 719 | gk20a_vm_unmap(g->cde_app.vm, dst_vaddr); |
750 | if (src_vaddr) | ||
751 | gk20a_vm_unmap(g->cde_app.vm, src_vaddr); | ||
752 | 720 | ||
753 | mutex_unlock(&cde_app->mutex); | 721 | mutex_unlock(&cde_app->mutex); |
754 | 722 | ||
@@ -1054,7 +1022,7 @@ static int gk20a_buffer_convert_gpu_to_cde( | |||
1054 | err = gk20a_init_cde_support(g); | 1022 | err = gk20a_init_cde_support(g); |
1055 | if (err) | 1023 | if (err) |
1056 | goto out; | 1024 | goto out; |
1057 | err = gk20a_cde_convert(g, dmabuf, dmabuf, | 1025 | err = gk20a_cde_convert(g, dmabuf, |
1058 | 0, /* dst kind */ | 1026 | 0, /* dst kind */ |
1059 | compbits_offset, | 1027 | compbits_offset, |
1060 | 0, /* dst_size, 0 = auto */ | 1028 | 0, /* dst_size, 0 = auto */ |
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h index c427d4db..b93e56c8 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h | |||
@@ -258,7 +258,7 @@ struct gk20a_cde_app { | |||
258 | int gk20a_cde_destroy(struct gk20a *g); | 258 | int gk20a_cde_destroy(struct gk20a *g); |
259 | int gk20a_init_cde_support(struct gk20a *g); | 259 | int gk20a_init_cde_support(struct gk20a *g); |
260 | int gk20a_cde_reload(struct gk20a *g); | 260 | int gk20a_cde_reload(struct gk20a *g); |
261 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src, struct dma_buf *dst, | 261 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, |
262 | s32 dst_kind, u64 dst_word_offset, | 262 | s32 dst_kind, u64 dst_word_offset, |
263 | u32 dst_size, struct nvgpu_fence *fence, | 263 | u32 dst_size, struct nvgpu_fence *fence, |
264 | u32 __flags, struct gk20a_cde_param *params, | 264 | u32 __flags, struct gk20a_cde_param *params, |