author		Deepak Nibade <dnibade@nvidia.com>	2016-08-04 10:26:42 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2016-09-01 12:10:00 -0400
commit		c845b210129a4a2ebd8a3cd22c53dc30cad3664d (patch)
tree		41c60205ce8927ef9062aa4a257fd9d8bdf8a4d5 /drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent		c38cc24e1a752d6eb5b07d771ddbf6ab700f695d (diff)
gpu: nvgpu: support GMMU mappings for vidmem page allocator
Switch to use the page allocator for vidmem.

Support GMMU mappings for the page (non-contiguous) page allocator in
update_gmmu_ptes_locked(): if the aperture is VIDMEM, traverse each chunk
in an allocation and map it to the GPU VA separately.

Fix CE page clearing to support the page allocator.

Fix gk20a_pramin_enter() to get the base address from the new allocator.

Define API gk20a_mem_get_vidmem_addr() to get the base address of an
allocation. Note that this API should not be used if we have more than
one chunk.

Jira DNVGPU-96

Change-Id: I725422f3538aeb477ca4220ba57ef8b3c53db703
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1199177
(cherry picked from commit 1afae6ee6529ab88cedd5bcbe458fbdc0d4b1fd8)
Reviewed-on: http://git-master/r/1197647
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
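The core idea of the update_gmmu_ptes_locked() change is that a vidmem allocation is now a list of physically discontiguous chunks, so a GPU VA range has to be mapped one chunk at a time while consuming any initial buffer offset (space_to_skip). The standalone C sketch below models that walk with a plain array instead of the driver's gk20a_page_alloc/page_alloc_chunk structures; struct chunk, map_range(), map_chunks() and the example sizes are illustrative assumptions, not nvgpu code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the driver's page_alloc_chunk. */
struct chunk {
	uint64_t base;   /* physical base of this chunk */
	uint64_t length; /* size of this chunk in bytes */
};

/* Stub for the per-range PTE update (update_gmmu_level_locked() in the driver). */
static void map_range(uint64_t gpu_va, uint64_t phys, uint64_t length)
{
	printf("map GPU VA 0x%llx -> PA 0x%llx, len 0x%llx\n",
	       (unsigned long long)gpu_va, (unsigned long long)phys,
	       (unsigned long long)length);
}

/*
 * Walk the chunks of a non-contiguous allocation, skip the first
 * 'space_to_skip' bytes, and map what remains chunk by chunk. This mirrors
 * the shape of the vidmem branch added to update_gmmu_ptes_locked().
 */
static void map_chunks(const struct chunk *chunks, int nr_chunks,
		       uint64_t gpu_va, uint64_t space_to_skip)
{
	for (int i = 0; i < nr_chunks; i++) {
		if (space_to_skip && space_to_skip >= chunks[i].length) {
			/* Whole chunk falls inside the skipped prefix. */
			space_to_skip -= chunks[i].length;
			continue;
		}
		map_range(gpu_va, chunks[i].base + space_to_skip,
			  chunks[i].length - space_to_skip);
		gpu_va += chunks[i].length - space_to_skip;
		space_to_skip = 0;
	}
}

int main(void)
{
	/* Example: three discontiguous 64 KiB chunks, skip the first 4 KiB. */
	struct chunk chunks[] = {
		{ 0x100000, 0x10000 },
		{ 0x300000, 0x10000 },
		{ 0x500000, 0x10000 },
	};

	map_chunks(chunks, 3, 0x40000000, 0x1000);
	return 0;
}

The same splitting also drives the CE clear path below, which issues one memset per chunk and only waits on the final fence.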
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/mm_gk20a.c	210
1 file changed, 149 insertions, 61 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 7c731890..dde798cf 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -29,6 +29,7 @@
 #include <linux/lcm.h>
 #include <uapi/linux/nvgpu.h>
 #include <trace/events/gk20a.h>
+#include <gk20a/page_allocator_priv.h>
 
 #include "gk20a.h"
 #include "mm_gk20a.h"
@@ -84,10 +85,31 @@ void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
 	mem->cpu_va = NULL;
 }
 
+static u64 gk20a_mem_get_vidmem_addr(struct gk20a *g, struct mem_desc *mem)
+{
+	struct gk20a_page_alloc *alloc;
+	struct page_alloc_chunk *chunk;
+
+	if (mem && mem->aperture == APERTURE_VIDMEM) {
+		alloc = (struct gk20a_page_alloc *)
+			sg_dma_address(mem->sgt->sgl);
+
+		/* This API should not be used with > 1 chunks */
+		if (alloc->nr_chunks != 1)
+			return 0;
+
+		chunk = list_first_entry(&alloc->alloc_chunks,
+				struct page_alloc_chunk, list_entry);
+		return chunk->base;
+	}
+
+	return 0;
+}
+
 /* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */
 static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem, u32 w)
 {
-	u64 bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+	u64 bufbase = gk20a_mem_get_vidmem_addr(g, mem);
 	u64 addr = bufbase + w * sizeof(u32);
 	u32 hi = (u32)((addr & ~(u64)0xfffff)
 		>> bus_bar0_window_target_bar0_window_base_shift_v());
@@ -765,9 +787,7 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm)
 		return 0;
 
 	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
-			SZ_4K, size - SZ_4K, SZ_4K,
-			GPU_ALLOC_FORCE_CONTIG |
-			GPU_ALLOC_NO_SCATTER_GATHER);
+			SZ_4K, size - SZ_4K, SZ_4K, 0);
 	if (err) {
 		gk20a_err(d, "Failed to register vidmem for size %zu: %d",
 				size, err);
@@ -2721,7 +2741,6 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 #if defined(CONFIG_GK20A_VIDMEM)
 	u64 addr;
 	int err;
-	bool need_pramin_access = true;
 
 	gk20a_dbg_fn("");
 
@@ -2764,13 +2783,22 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 
 	if (g->mm.vidmem.ce_ctx_id != ~0) {
 		struct gk20a_fence *gk20a_fence_out = NULL;
-		u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+		struct gk20a_fence *gk20a_last_fence = NULL;
+		struct gk20a_page_alloc *alloc = NULL;
+		struct page_alloc_chunk *chunk = NULL;
 
-		err = gk20a_ce_execute_ops(g->dev,
+		alloc = (struct gk20a_page_alloc *)
+			g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+
+		list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
+			if (gk20a_last_fence)
+				gk20a_fence_put(gk20a_last_fence);
+
+			err = gk20a_ce_execute_ops(g->dev,
 				g->mm.vidmem.ce_ctx_id,
 				0,
-				dst_bufbase,
-				(u64)size,
+				chunk->base,
+				chunk->length,
 				0x00000000,
 				NVGPU_CE_DST_LOCATION_LOCAL_FB,
 				NVGPU_CE_MEMSET,
@@ -2778,27 +2806,31 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 				0,
 				&gk20a_fence_out);
 
-		if (!err) {
-			if (gk20a_fence_out) {
-				err = gk20a_fence_wait(gk20a_fence_out, gk20a_get_gr_idle_timeout(g));
-				gk20a_fence_put(gk20a_fence_out);
-				if (err)
-					gk20a_err(g->dev,
-						"Failed to get the fence_out from CE execute ops");
-				else
-					need_pramin_access = false;
+			if (err) {
+				gk20a_err(g->dev,
+					"Failed gk20a_ce_execute_ops[%d]", err);
+				goto fail_free_table;
 			}
-		} else
-			gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]",err);
-	}
 
-	if (need_pramin_access)
-		gk20a_memset(g, mem, 0, 0, size);
+			gk20a_last_fence = gk20a_fence_out;
+		}
+
+		if (gk20a_last_fence) {
+			err = gk20a_fence_wait(gk20a_last_fence,
+					gk20a_get_gr_idle_timeout(g));
+			gk20a_fence_put(gk20a_last_fence);
+			if (err)
+				gk20a_err(g->dev,
+					"Failed to get the fence_out from CE execute ops");
+		}
+	}
 
 	gk20a_dbg_fn("done at 0x%llx size %zu", addr, size);
 
 	return 0;
 
+fail_free_table:
+	sg_free_table(mem->sgt);
 fail_kfree:
 	kfree(mem->sgt);
 fail_physfree:
@@ -3381,13 +3413,9 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
 	int err;
 	struct scatterlist *sgl = NULL;
-
-	gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
-		   pgsz_idx,
-		   sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
-		   : 0ULL,
-		   buffer_offset,
-		   sgt ? sgt->nents : 0);
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *chunk = NULL;
+	u64 length;
 
 	/* note: here we need to map kernel to small, since the
 	 * low-level mmu code assumes 0 is small and 1 is big pages */
@@ -3397,45 +3425,105 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	if (space_to_skip & (page_size - 1))
 		return -EINVAL;
 
-	if (sgt) {
-		iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
-		if (!vm->mm->bypass_smmu && iova) {
-			iova += space_to_skip;
+	err = map_gmmu_pages(g, &vm->pdb);
+	if (err) {
+		gk20a_err(dev_from_vm(vm),
+			   "couldn't map ptes for update as=%d",
+			   vm_aspace_id(vm));
+		return err;
+	}
+
+	if (aperture == APERTURE_VIDMEM) {
+		gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx",
+			   pgsz_idx, gpu_va, gpu_end-1, iova);
+
+		if (sgt) {
+			alloc = (struct gk20a_page_alloc *)
+				g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+
+			list_for_each_entry(chunk, &alloc->alloc_chunks,
+							list_entry) {
+				if (space_to_skip &&
+				    space_to_skip > chunk->length) {
+					space_to_skip -= chunk->length;
+				} else {
+					iova = chunk->base + space_to_skip;
+					length = chunk->length - space_to_skip;
+					space_to_skip = 0;
+
+					err = update_gmmu_level_locked(vm,
+						&vm->pdb, pgsz_idx,
+						&sgl,
+						&space_to_skip,
+						&iova,
+						gpu_va, gpu_va + length,
+						kind_v, &ctag,
+						cacheable, unmapped_pte,
+						rw_flag, sparse, 0, priv,
+						aperture);
+
+					/* need to set explicit zero here */
+					space_to_skip = 0;
+					gpu_va += length;
+				}
+			}
 		} else {
-			sgl = sgt->sgl;
-
-			gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-				  (u64)sg_phys(sgl),
-				  sgl->length);
-			while (space_to_skip && sgl &&
-			       space_to_skip + page_size > sgl->length) {
-				space_to_skip -= sgl->length;
-				sgl = sg_next(sgl);
+			err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+					&sgl,
+					&space_to_skip,
+					&iova,
+					gpu_va, gpu_end,
+					kind_v, &ctag,
+					cacheable, unmapped_pte, rw_flag,
+					sparse, 0, priv,
+					aperture);
+		}
+	} else {
+		gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
+			   pgsz_idx,
+			   sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
+			   : 0ULL,
+			   buffer_offset,
+			   sgt ? sgt->nents : 0);
+
+		gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
+			   pgsz_idx, gpu_va, gpu_end-1, iova);
+
+		if (sgt) {
+			iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+			if (!vm->mm->bypass_smmu && iova) {
+				iova += space_to_skip;
+			} else {
+				sgl = sgt->sgl;
+
 				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
 					  (u64)sg_phys(sgl),
 					  sgl->length);
+
+				while (space_to_skip && sgl &&
+				       space_to_skip + page_size > sgl->length) {
+					space_to_skip -= sgl->length;
+					sgl = sg_next(sgl);
+					gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+						  (u64)sg_phys(sgl),
+						  sgl->length);
+				}
+
+				iova = sg_phys(sgl) + space_to_skip;
 			}
-			iova = sg_phys(sgl) + space_to_skip;
 		}
-	}
 
-	gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
-		   pgsz_idx, gpu_va, gpu_end-1, iova);
-	err = map_gmmu_pages(g, &vm->pdb);
-	if (err) {
-		gk20a_err(dev_from_vm(vm),
-			   "couldn't map ptes for update as=%d",
-			   vm_aspace_id(vm));
-		return err;
+		err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+				&sgl,
+				&space_to_skip,
+				&iova,
+				gpu_va, gpu_end,
+				kind_v, &ctag,
+				cacheable, unmapped_pte, rw_flag,
+				sparse, 0, priv,
+				aperture);
 	}
-	err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
-			&sgl,
-			&space_to_skip,
-			&iova,
-			gpu_va, gpu_end,
-			kind_v, &ctag,
-			cacheable, unmapped_pte, rw_flag, sparse, 0, priv,
-			aperture);
+
 	unmap_gmmu_pages(g, &vm->pdb);
 
 	smp_mb();