From c845b210129a4a2ebd8a3cd22c53dc30cad3664d Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Thu, 4 Aug 2016 19:56:42 +0530
Subject: gpu: nvgpu: support GMMU mappings for vidmem page allocator

Switch to using the page allocator for vidmem.

Support GMMU mappings for the (non-contiguous) page allocator in
update_gmmu_ptes_locked(): if the aperture is VIDMEM, traverse each
chunk in an allocation and map it to the GPU VA separately.

Fix CE page clearing to support the page allocator.

Fix gk20a_pramin_enter() to get the base address from the new
allocator.

Define an API, gk20a_mem_get_vidmem_addr(), to get the base address of
an allocation. Note that this API should not be used if the allocation
has more than one chunk.
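
To make the chunk walk concrete, here is a minimal, self-contained
userspace sketch of the traversal described above. This is an
illustration only, not driver code: struct chunk and map_range() are
hypothetical stand-ins for struct page_alloc_chunk and
update_gmmu_level_locked(), and all addresses are made up.

  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>

  /* hypothetical stand-in for struct page_alloc_chunk */
  struct chunk {
          uint64_t base;    /* physical base address of this chunk */
          uint64_t length;  /* chunk size in bytes */
  };

  /* stand-in for update_gmmu_level_locked(): just log one mapping */
  static void map_range(uint64_t gpu_va, uint64_t pa, uint64_t len)
  {
          printf("map GPU VA 0x%llx -> PA 0x%llx (0x%llx bytes)\n",
                 (unsigned long long)gpu_va, (unsigned long long)pa,
                 (unsigned long long)len);
  }

  int main(void)
  {
          /* three discontiguous chunks, as the page allocator may return */
          struct chunk chunks[] = {
                  { 0x10000000ULL, 0x4000 },
                  { 0x20000000ULL, 0x2000 },
                  { 0x30000000ULL, 0x8000 },
          };
          uint64_t gpu_va = 0x100000000ULL; /* where the buffer is mapped */
          uint64_t space_to_skip = 0x5000;  /* byte offset into the buffer */
          size_t i;

          for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
                  struct chunk *c = &chunks[i];

                  if (space_to_skip && space_to_skip >= c->length) {
                          /* chunk lies entirely before the offset: skip it */
                          space_to_skip -= c->length;
                          continue;
                  }
                  /* map the remainder of this chunk at the next GPU VA */
                  map_range(gpu_va, c->base + space_to_skip,
                            c->length - space_to_skip);
                  gpu_va += c->length - space_to_skip;
                  /* only the first mapped chunk can be partial */
                  space_to_skip = 0;
          }
          return 0;
  }

With chunk sizes 0x4000, 0x2000 and 0x8000 and an offset of 0x5000,
this skips the first chunk, maps the second starting 0x1000 into it,
and maps the third in full; that is the same shape as the
list_for_each_entry() walk added to update_gmmu_ptes_locked() below.
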
Jira DNVGPU-96

Change-Id: I725422f3538aeb477ca4220ba57ef8b3c53db703
Signed-off-by: Deepak Nibade
Reviewed-on: http://git-master/r/1199177
(cherry picked from commit 1afae6ee6529ab88cedd5bcbe458fbdc0d4b1fd8)
Reviewed-on: http://git-master/r/1197647
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 210 ++++++++++++++++++++++++++-----------
 1 file changed, 149 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 7c731890..dde798cf 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "gk20a.h"
 #include "mm_gk20a.h"
@@ -84,10 +85,31 @@ void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
         mem->cpu_va = NULL;
 }
 
+static u64 gk20a_mem_get_vidmem_addr(struct gk20a *g, struct mem_desc *mem)
+{
+        struct gk20a_page_alloc *alloc;
+        struct page_alloc_chunk *chunk;
+
+        if (mem && mem->aperture == APERTURE_VIDMEM) {
+                alloc = (struct gk20a_page_alloc *)
+                        sg_dma_address(mem->sgt->sgl);
+
+                /* This API should not be used with > 1 chunks */
+                if (alloc->nr_chunks != 1)
+                        return 0;
+
+                chunk = list_first_entry(&alloc->alloc_chunks,
+                                struct page_alloc_chunk, list_entry);
+                return chunk->base;
+        }
+
+        return 0;
+}
+
 /* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */
 static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem, u32 w)
 {
-        u64 bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+        u64 bufbase = gk20a_mem_get_vidmem_addr(g, mem);
         u64 addr = bufbase + w * sizeof(u32);
         u32 hi = (u32)((addr & ~(u64)0xfffff) >>
                 bus_bar0_window_target_bar0_window_base_shift_v());
@@ -765,9 +787,7 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm)
                 return 0;
 
         err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
-                        SZ_4K, size - SZ_4K, SZ_4K,
-                        GPU_ALLOC_FORCE_CONTIG |
-                        GPU_ALLOC_NO_SCATTER_GATHER);
+                        SZ_4K, size - SZ_4K, SZ_4K, 0);
         if (err) {
                 gk20a_err(d, "Failed to register vidmem for size %zu: %d",
                                 size, err);
@@ -2721,7 +2741,6 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 #if defined(CONFIG_GK20A_VIDMEM)
         u64 addr;
         int err;
-        bool need_pramin_access = true;
 
         gk20a_dbg_fn("");
 
@@ -2764,13 +2783,22 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 
         if (g->mm.vidmem.ce_ctx_id != ~0) {
                 struct gk20a_fence *gk20a_fence_out = NULL;
-                u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+                struct gk20a_fence *gk20a_last_fence = NULL;
+                struct gk20a_page_alloc *alloc = NULL;
+                struct page_alloc_chunk *chunk = NULL;
 
-                err = gk20a_ce_execute_ops(g->dev,
+                alloc = (struct gk20a_page_alloc *)
+                        g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+
+                list_for_each_entry(chunk, &alloc->alloc_chunks,
+                                list_entry) {
+                        if (gk20a_last_fence)
+                                gk20a_fence_put(gk20a_last_fence);
+
+                        err = gk20a_ce_execute_ops(g->dev,
                                 g->mm.vidmem.ce_ctx_id,
                                 0,
-                                dst_bufbase,
-                                (u64)size,
+                                chunk->base,
+                                chunk->length,
                                 0x00000000,
                                 NVGPU_CE_DST_LOCATION_LOCAL_FB,
                                 NVGPU_CE_MEMSET,
@@ -2778,27 +2806,31 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
                                 0,
                                 &gk20a_fence_out);
 
-                if (!err) {
-                        if (gk20a_fence_out) {
-                                err = gk20a_fence_wait(gk20a_fence_out,
-                                                gk20a_get_gr_idle_timeout(g));
-                                gk20a_fence_put(gk20a_fence_out);
-                                if (err)
-                                        gk20a_err(g->dev,
-                                                "Failed to get the fence_out from CE execute ops");
-                                else
-                                        need_pramin_access = false;
+                        if (err) {
+                                gk20a_err(g->dev,
+                                        "Failed gk20a_ce_execute_ops[%d]", err);
+                                goto fail_free_table;
                         }
-                } else
-                        gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]", err);
-        }
 
-        if (need_pramin_access)
-                gk20a_memset(g, mem, 0, 0, size);
+                        gk20a_last_fence = gk20a_fence_out;
+                }
+
+                if (gk20a_last_fence) {
+                        err = gk20a_fence_wait(gk20a_last_fence,
+                                        gk20a_get_gr_idle_timeout(g));
+                        gk20a_fence_put(gk20a_last_fence);
+                        if (err)
+                                gk20a_err(g->dev,
+                                        "Failed to get the fence_out from CE execute ops");
+                }
+        }
 
         gk20a_dbg_fn("done at 0x%llx size %zu", addr, size);
 
         return 0;
 
+fail_free_table:
+        sg_free_table(mem->sgt);
 fail_kfree:
         kfree(mem->sgt);
 fail_physfree:
@@ -3381,13 +3413,9 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
         u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
         int err;
         struct scatterlist *sgl = NULL;
-
-        gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
-                        pgsz_idx,
-                        sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
-                            : 0ULL,
-                        buffer_offset,
-                        sgt ? sgt->nents : 0);
+        struct gk20a_page_alloc *alloc = NULL;
+        struct page_alloc_chunk *chunk = NULL;
+        u64 length;
 
         /* note: here we need to map kernel to small, since the
          * low-level mmu code assumes 0 is small and 1 is big pages */
@@ -3397,45 +3425,105 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
         if (space_to_skip & (page_size - 1))
                 return -EINVAL;
 
-        if (sgt) {
-                iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
-                if (!vm->mm->bypass_smmu && iova) {
-                        iova += space_to_skip;
+        err = map_gmmu_pages(g, &vm->pdb);
+        if (err) {
+                gk20a_err(dev_from_vm(vm),
+                        "couldn't map ptes for update as=%d",
+                        vm_aspace_id(vm));
+                return err;
+        }
+
+        if (aperture == APERTURE_VIDMEM) {
+                gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx",
+                                pgsz_idx, gpu_va, gpu_end-1, iova);
+
+                if (sgt) {
+                        alloc = (struct gk20a_page_alloc *)
+                                g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+
+                        list_for_each_entry(chunk, &alloc->alloc_chunks,
+                                        list_entry) {
+                                if (space_to_skip &&
+                                    space_to_skip > chunk->length) {
+                                        space_to_skip -= chunk->length;
+                                } else {
+                                        iova = chunk->base + space_to_skip;
+                                        length = chunk->length - space_to_skip;
+                                        space_to_skip = 0;
+
+                                        err = update_gmmu_level_locked(vm,
+                                                &vm->pdb, pgsz_idx,
+                                                &sgl,
+                                                &space_to_skip,
+                                                &iova,
+                                                gpu_va, gpu_va + length,
+                                                kind_v, &ctag,
+                                                cacheable, unmapped_pte,
+                                                rw_flag, sparse, 0, priv,
+                                                aperture);
+
+                                        /* need to set explicit zero here */
+                                        space_to_skip = 0;
+                                        gpu_va += length;
+                                }
+                        }
                 } else {
-                        sgl = sgt->sgl;
-
-                        gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-                                        (u64)sg_phys(sgl),
-                                        sgl->length);
-                        while (space_to_skip && sgl &&
-                               space_to_skip + page_size > sgl->length) {
-                                space_to_skip -= sgl->length;
-                                sgl = sg_next(sgl);
+                        err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+                                        &sgl,
+                                        &space_to_skip,
+                                        &iova,
+                                        gpu_va, gpu_end,
+                                        kind_v, &ctag,
+                                        cacheable, unmapped_pte, rw_flag,
+                                        sparse, 0, priv,
+                                        aperture);
+                }
+        } else {
+                gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
+                                pgsz_idx,
+                                sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
+                                    : 0ULL,
+                                buffer_offset,
+                                sgt ? sgt->nents : 0);
+
+                gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
+                                pgsz_idx, gpu_va, gpu_end-1, iova);
+
+                if (sgt) {
+                        iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+                        if (!vm->mm->bypass_smmu && iova) {
+                                iova += space_to_skip;
+                        } else {
+                                sgl = sgt->sgl;
+
+                                gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+                                                (u64)sg_phys(sgl),
+                                                sgl->length);
+
+                                while (space_to_skip && sgl &&
+                                       space_to_skip + page_size > sgl->length) {
+                                        space_to_skip -= sgl->length;
+                                        sgl = sg_next(sgl);
+                                        gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+                                                        (u64)sg_phys(sgl),
+                                                        sgl->length);
+                                }
+
+                                iova = sg_phys(sgl) + space_to_skip;
+                        }
                 }
-                iova = sg_phys(sgl) + space_to_skip;
-        }
 
-        gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
-                        pgsz_idx, gpu_va, gpu_end-1, iova);
-
-        err = map_gmmu_pages(g, &vm->pdb);
-        if (err) {
-                gk20a_err(dev_from_vm(vm),
-                        "couldn't map ptes for update as=%d",
-                        vm_aspace_id(vm));
-                return err;
+                err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+                                &sgl,
+                                &space_to_skip,
+                                &iova,
+                                gpu_va, gpu_end,
+                                kind_v, &ctag,
+                                cacheable, unmapped_pte, rw_flag,
+                                sparse, 0, priv,
+                                aperture);
         }
 
-        err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
-                        &sgl,
-                        &space_to_skip,
-                        &iova,
-                        gpu_va, gpu_end,
-                        kind_v, &ctag,
-                        cacheable, unmapped_pte, rw_flag, sparse, 0, priv,
-                        aperture);
+        unmap_gmmu_pages(g, &vm->pdb);
 
         smp_mb();
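
A note on the PRAMIN path touched by this change: gk20a_pramin_enter()
splits the vidmem byte address (now obtained from
gk20a_mem_get_vidmem_addr()) into a window base programmed via BAR0 and
a low offset inside the window. The following userspace sketch shows
that arithmetic; the 1 MB window size is taken from the 0xfffff mask in
the code above, while the 16-bit base shift is an assumption standing
in for the real bus_bar0_window_target_bar0_window_base_shift_v()
value.

  #include <stdint.h>
  #include <stdio.h>

  #define WIN_OFFSET_MASK 0xfffffULL /* low 20 bits: offset in 1 MB window */
  #define WIN_BASE_SHIFT  16         /* assumed: base in 64 KB units */

  int main(void)
  {
          uint64_t bufbase = 0x12340000ULL; /* vidmem base of the buffer */
          uint32_t w = 10;                  /* word index, as in pramin_enter */

          uint64_t addr = bufbase + w * sizeof(uint32_t);
          uint32_t hi = (uint32_t)((addr & ~WIN_OFFSET_MASK) >> WIN_BASE_SHIFT);
          uint32_t lo = (uint32_t)(addr & WIN_OFFSET_MASK);

          /* hi is written to the bar0_window register, lo indexes PRAMIN */
          printf("addr=0x%llx -> window hi=0x%x, offset lo=0x%x\n",
                 (unsigned long long)addr, (unsigned)hi, (unsigned)lo);
          return 0;
  }

Accesses within the same 1 MB window can then share one window setup,
which fits the warning above that gk20a_pramin_enter() returns with
pramin_window_lock held until the matching pramin_exit().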