From b8915ab5aabb02866019221c51d96f304658207f Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Fri, 17 Jun 2016 15:56:07 +0300
Subject: gpu: nvgpu: support in-kernel vidmem mappings

Propagate the buffer aperture flag in gk20a_locked_gmmu_map up so that
buffers represented as a mem_desc and present in vidmem can be mapped
to gpu.

JIRA DNVGPU-18
JIRA DNVGPU-76

Change-Id: I46cf87e27229123016727339b9349d5e2c835b3e
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/1169308
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c            |  3 +-
 drivers/gpu/nvgpu/gk20a/gk20a.h                |  3 +-
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c             | 66 ++++++++++++--------------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c             | 38 ++++++++++-----
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h             | 16 +++++--
 drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c |  7 +--
 drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c      |  6 ++-
 drivers/gpu/nvgpu/gm20b/acr_gm20b.c            |  6 ++-
 drivers/gpu/nvgpu/vgpu/mm_vgpu.c               |  3 +-
 9 files changed, 85 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 7818f046..02b1938a 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1215,7 +1215,8 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 				   g->gr.compbit_store.mem.size,
 				   NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 				   gk20a_mem_flag_read_only,
-				   false);
+				   false,
+				   gr->compbit_store.mem.aperture);
 
 	if (!vaddr) {
 		gk20a_warn(cde_ctx->dev, "cde: cannot map compression bit backing store");
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 45e16ad9..b8a2fc3e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -492,7 +492,8 @@ struct gpu_ops {
 				bool clear_ctags,
 				bool sparse,
 				bool priv,
-				struct vm_gk20a_mapping_batch *batch);
+				struct vm_gk20a_mapping_batch *batch,
+				enum gk20a_aperture aperture);
 		void (*gmmu_unmap)(struct vm_gk20a *vm,
 				u64 vaddr,
 				u64 size,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index bdc65cab..0d97e84c 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1824,7 +1824,8 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
 					&pm_ctx->mem.sgt,
 					pm_ctx->mem.size,
 					NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
-					gk20a_mem_flag_none, true);
+					gk20a_mem_flag_none, true,
+					pm_ctx->mem.aperture);
 		if (!pm_ctx->mem.gpu_va) {
 			gk20a_err(dev_from_gk20a(g),
 				"failed to map pm ctxt buffer");
@@ -2046,7 +2047,8 @@ static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
 					ucode_info->surface_desc.size,
 					0, /* flags */
 					gk20a_mem_flag_read_only,
-					false);
+					false,
+					ucode_info->surface_desc.aperture);
 	if (!ucode_info->surface_desc.gpu_va) {
 		gk20a_err(d, "failed to update gmmu ptes\n");
 		return -ENOMEM;
@@ -2650,82 +2652,73 @@ static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
 	u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
 	u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
 	struct gr_gk20a *gr = &g->gr;
-	struct sg_table *sgt;
-	u64 size;
+	struct mem_desc *mem;
 	u64 gpu_va;
 	u32 i;
 	gk20a_dbg_fn("");
 
 	/* Circular Buffer */
 	if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].mem.sgt == NULL)) {
-		sgt = gr->global_ctx_buffer[CIRCULAR].mem.sgt;
-		size = gr->global_ctx_buffer[CIRCULAR].mem.size;
+		mem = &gr->global_ctx_buffer[CIRCULAR].mem;
 	} else {
-		sgt = gr->global_ctx_buffer[CIRCULAR_VPR].mem.sgt;
-		size = gr->global_ctx_buffer[CIRCULAR_VPR].mem.size;
+		mem = &gr->global_ctx_buffer[CIRCULAR_VPR].mem;
 	}
 
-	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+	gpu_va = gk20a_gmmu_map(ch_vm, &mem->sgt, mem->size,
 				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
-				gk20a_mem_flag_none, true);
+				gk20a_mem_flag_none, true, mem->aperture);
 	if (!gpu_va)
 		goto clean_up;
 	g_bfr_va[CIRCULAR_VA] = gpu_va;
-	g_bfr_size[CIRCULAR_VA] = size;
+	g_bfr_size[CIRCULAR_VA] = mem->size;
 
 	/* Attribute Buffer */
 	if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.sgt == NULL)) {
-		sgt = gr->global_ctx_buffer[ATTRIBUTE].mem.sgt;
-		size = gr->global_ctx_buffer[ATTRIBUTE].mem.size;
+		mem = &gr->global_ctx_buffer[ATTRIBUTE].mem;
 	} else {
-		sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.sgt;
-		size = gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.size;
+		mem = &gr->global_ctx_buffer[ATTRIBUTE_VPR].mem;
 	}
 
-	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+	gpu_va = gk20a_gmmu_map(ch_vm, &mem->sgt, mem->size,
 				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
-				gk20a_mem_flag_none, false);
+				gk20a_mem_flag_none, false, mem->aperture);
 	if (!gpu_va)
 		goto clean_up;
 	g_bfr_va[ATTRIBUTE_VA] = gpu_va;
-	g_bfr_size[ATTRIBUTE_VA] = size;
+	g_bfr_size[ATTRIBUTE_VA] = mem->size;
 
 	/* Page Pool */
 	if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].mem.sgt == NULL)) {
-		sgt = gr->global_ctx_buffer[PAGEPOOL].mem.sgt;
-		size = gr->global_ctx_buffer[PAGEPOOL].mem.size;
+		mem = &gr->global_ctx_buffer[PAGEPOOL].mem;
 	} else {
-		sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].mem.sgt;
-		size = gr->global_ctx_buffer[PAGEPOOL_VPR].mem.size;
+		mem = &gr->global_ctx_buffer[PAGEPOOL_VPR].mem;
 	}
 
-	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+	gpu_va = gk20a_gmmu_map(ch_vm, &mem->sgt, mem->size,
 				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
-				gk20a_mem_flag_none, true);
+				gk20a_mem_flag_none, true, mem->aperture);
 	if (!gpu_va)
 		goto clean_up;
 	g_bfr_va[PAGEPOOL_VA] = gpu_va;
-	g_bfr_size[PAGEPOOL_VA] = size;
+	g_bfr_size[PAGEPOOL_VA] = mem->size;
 
 	/* Golden Image */
-	sgt = gr->global_ctx_buffer[GOLDEN_CTX].mem.sgt;
-	size = gr->global_ctx_buffer[GOLDEN_CTX].mem.size;
-	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
-				gk20a_mem_flag_none, true);
+	mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
+	gpu_va = gk20a_gmmu_map(ch_vm, &mem->sgt, mem->size, 0,
+				gk20a_mem_flag_none, true, mem->aperture);
 	if (!gpu_va)
 		goto clean_up;
 	g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
-	g_bfr_size[GOLDEN_CTX_VA] = size;
+	g_bfr_size[GOLDEN_CTX_VA] = mem->size;
 
 	/* Priv register Access Map */
-	sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.sgt;
-	size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size;
-	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
-				gk20a_mem_flag_none, true);
+	mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
+	gpu_va = gk20a_gmmu_map(ch_vm, &mem->sgt, mem->size, 0,
+				gk20a_mem_flag_none, true, mem->aperture);
 	if (!gpu_va)
 		goto clean_up;
 	g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
-	g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
+	g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size;
 
 	c->ch_ctx.global_ctx_buffer_mapped = true;
 	return 0;
@@ -2793,7 +2786,8 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
 	gr_ctx->mem.gpu_va = gk20a_gmmu_map(vm, &gr_ctx->mem.sgt,
 				gr_ctx->mem.size,
 				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
-				gk20a_mem_flag_none, true);
+				gk20a_mem_flag_none, true,
+				gr_ctx->mem.aperture);
 	if (!gr_ctx->mem.gpu_va)
 		goto err_free_mem;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 6fdfacdd..bb32749d 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1594,7 +1594,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 			bool clear_ctags,
 			bool sparse,
 			bool priv,
-			struct vm_gk20a_mapping_batch *batch)
+			struct vm_gk20a_mapping_batch *batch,
+			enum gk20a_aperture aperture)
 {
 	int err = 0;
 	bool allocated = false;
@@ -1642,7 +1643,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 				rw_flag,
 				sparse,
 				priv,
-				APERTURE_SYSMEM); /* no vidmem bufs yet */
+				aperture);
 	if (err) {
 		gk20a_err(d, "failed to update ptes on map");
 		goto fail_validate;
@@ -1998,7 +1999,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 					clear_ctags,
 					false,
 					false,
-					batch);
+					batch,
+					APERTURE_SYSMEM); /* no vidmem yet */
 	if (!map_offset)
 		goto clean_up;
@@ -2256,7 +2258,8 @@ int gk20a_vm_map_compbits(struct vm_gk20a *vm,
 			false, /* clear_ctags */
 			false, /* sparse */
 			false, /* priv */
-			NULL); /* mapping_batch handle */
+			NULL, /* mapping_batch handle */
+			g->gr.compbit_store.mem.aperture);
 
 	if (!mapped_buffer->ctag_map_win_addr) {
 		mutex_unlock(&vm->update_gmmu_lock);
@@ -2295,7 +2298,8 @@ static u64 __gk20a_gmmu_map(struct vm_gk20a *vm,
 		u64 size,
 		u32 flags,
 		int rw_flag,
-		bool priv)
+		bool priv,
+		enum gk20a_aperture aperture)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	u64 vaddr;
@@ -2312,7 +2316,8 @@ static u64 __gk20a_gmmu_map(struct vm_gk20a *vm,
 			false, /* clear_ctags */
 			false, /* sparse */
 			priv, /* priv */
-			NULL); /* mapping_batch handle */
+			NULL, /* mapping_batch handle */
+			aperture);
 	mutex_unlock(&vm->update_gmmu_lock);
 	if (!vaddr) {
 		gk20a_err(dev_from_vm(vm), "failed to allocate va space");
@@ -2327,9 +2332,11 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
 		u64 size,
 		u32 flags,
 		int rw_flag,
-		bool priv)
+		bool priv,
+		enum gk20a_aperture aperture)
 {
-	return __gk20a_gmmu_map(vm, sgt, 0, size, flags, rw_flag, priv);
+	return __gk20a_gmmu_map(vm, sgt, 0, size, flags, rw_flag, priv,
+			aperture);
 }
 
 /*
@@ -2341,9 +2348,11 @@ u64 gk20a_gmmu_fixed_map(struct vm_gk20a *vm,
 		u64 size,
 		u32 flags,
 		int rw_flag,
-		bool priv)
+		bool priv,
+		enum gk20a_aperture aperture)
 {
-	return __gk20a_gmmu_map(vm, sgt, addr, size, flags, rw_flag, priv);
+	return __gk20a_gmmu_map(vm, sgt, addr, size, flags, rw_flag, priv,
+			aperture);
 }
 
 int gk20a_gmmu_alloc(struct gk20a *g, size_t size, struct mem_desc *mem)
@@ -2599,7 +2608,8 @@ int gk20a_gmmu_alloc_map_attr(struct vm_gk20a *vm,
 		return err;
 
 	mem->gpu_va = gk20a_gmmu_map(vm, &mem->sgt, size, 0,
-				     gk20a_mem_flag_none, false);
+				     gk20a_mem_flag_none, false,
+				     mem->aperture);
 	if (!mem->gpu_va) {
 		err = -ENOMEM;
 		goto fail_free;
@@ -2626,7 +2636,8 @@ int gk20a_gmmu_alloc_map_attr_vid(struct vm_gk20a *vm,
 		return err;
 
 	mem->gpu_va = gk20a_gmmu_map(vm, &mem->sgt, size, 0,
-				     gk20a_mem_flag_none, false);
+				     gk20a_mem_flag_none, false,
+				     mem->aperture);
 	if (!mem->gpu_va) {
 		err = -ENOMEM;
 		goto fail_free;
@@ -3727,7 +3738,8 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 					 false,
 					 true,
 					 false,
-					 NULL);
+					 NULL,
+					 APERTURE_INVALID);
 	if (!map_offset) {
 		mutex_unlock(&vm->update_gmmu_lock);
 		gk20a_bfree(vma, vaddr_start);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index a697e520..f87ba605 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -40,8 +40,13 @@
 		outer_flush_range(pa, pa + (size_t)(size));	\
 	} while (0)
 
+/*
+ * Real location of a buffer - gk20a_aperture_mask() will deduce what will be
+ * told to the gpu about the aperture, but this flag designates where the
+ * memory actually was allocated from.
+ */
 enum gk20a_aperture {
-	APERTURE_INVALID, /* e.g., unallocated */
+	APERTURE_INVALID, /* unallocated or N/A */
 	APERTURE_SYSMEM,
 	APERTURE_VIDMEM
 };
@@ -520,14 +525,16 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
 		u64 size,
 		u32 flags,
 		int rw_flag,
-		bool priv);
+		bool priv,
+		enum gk20a_aperture aperture);
 u64 gk20a_gmmu_fixed_map(struct vm_gk20a *vm,
 		struct sg_table **sgt,
 		u64 addr,
 		u64 size,
 		u32 flags,
 		int rw_flag,
-		bool priv);
+		bool priv,
+		enum gk20a_aperture aperture);
 
 int gk20a_gmmu_alloc_map(struct vm_gk20a *vm,
 		size_t size,
@@ -619,7 +626,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 			bool clear_ctags,
 			bool sparse,
 			bool priv,
-			struct vm_gk20a_mapping_batch *batch);
+			struct vm_gk20a_mapping_batch *batch,
+			enum gk20a_aperture aperture);
 
 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
 		u64 vaddr,
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index b8f70ab3..25f9a8dd 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -130,9 +130,6 @@ int gk20a_tegra_secure_alloc(struct device *dev,
 	if (dma_mapping_error(&tegra_vpr_dev, iova))
 		return -ENOMEM;
 
-	desc->mem.size = size;
-	desc->destroy = gk20a_tegra_secure_destroy;
-
 	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
 	if (!sgt) {
 		gk20a_err(dev, "failed to allocate memory\n");
@@ -148,7 +145,11 @@ int gk20a_tegra_secure_alloc(struct device *dev,
 	/* This bypasses SMMU for VPR during gmmu_map. */
 	sg_dma_address(sgt->sgl) = 0;
 
+	desc->destroy = gk20a_tegra_secure_destroy;
+
 	desc->mem.sgt = sgt;
+	desc->mem.size = size;
+	desc->mem.aperture = APERTURE_SYSMEM;
 
 	return err;
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index aa375b24..113c59ef 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -187,7 +187,8 @@ int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
 	/* Map into the GPU... Doesn't need to be fixed. */
 	p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
-				   0, gk20a_mem_flag_none, false);
+				   0, gk20a_mem_flag_none, false,
+				   APERTURE_SYSMEM);
 	if (!p->gpu_va) {
 		err = -ENOMEM;
 		goto fail_unmap_sgt;
@@ -204,7 +205,8 @@ int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
 				p->sema_sea->gpu_va, p->sema_sea->map_size,
 				0, gk20a_mem_flag_read_only,
-				false);
+				false,
+				APERTURE_SYSMEM);
 	if (!addr) {
 		err = -ENOMEM;
 		BUG();
diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
index eb9ae08c..c503bc48 100644
--- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
@@ -412,7 +412,8 @@ int prepare_ucode_blob(struct gk20a *g)
 	sg_dma_address(sgt->sgl) = 0;
 
 	g->pmu.wpr_buf.gpu_va = gk20a_gmmu_map(vm, &sgt, wprsize,
-					       0, gk20a_mem_flag_none, false);
+					       0, gk20a_mem_flag_none, false,
+					       APERTURE_SYSMEM);
 	gm20b_dbg_pmu("wpr mapped gpu va :%llx\n", g->pmu.wpr_buf.gpu_va);
 
 	/* Discover all managed falcons*/
@@ -1412,7 +1413,8 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt)
 		acr->hsbl_ucode.gpu_va = gk20a_gmmu_map(vm, &acr->hsbl_ucode.sgt,
 					bl_sz,
 					0, /* flags */
-					gk20a_mem_flag_read_only, false);
+					gk20a_mem_flag_read_only, false,
+					acr->hsbl_ucode.aperture);
 		if (!acr->hsbl_ucode.gpu_va) {
 			gk20a_err(d, "failed to map pmu ucode memory!!");
 			goto err_free_ucode;
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 8af01158..2239fcbc 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -80,7 +80,8 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
 			bool clear_ctags,
 			bool sparse,
 			bool priv,
-			struct vm_gk20a_mapping_batch *batch)
+			struct vm_gk20a_mapping_batch *batch,
+			enum gk20a_aperture aperture)
 {
 	int err = 0;
 	struct device *d = dev_from_vm(vm);
-- 
cgit v1.2.2
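Usage note (editorial illustration, not part of the change): with this
patch, every in-kernel mapping call states where its backing memory
really lives. Buffers wrapped in a mem_desc forward mem->aperture, as
the gr_gk20a.c and mm_gk20a.c hunks above do; buffers known to be in
system memory (the semaphore pool, the ACR WPR buffer) pass
APERTURE_SYSMEM explicitly; and gk20a_vm_alloc_space() passes
APERTURE_INVALID because a sparse VA reservation has no physical
backing. A minimal sketch of the mem_desc pattern against the new
gk20a_gmmu_map() signature follows -- the helper name example_map_mem()
is invented for illustration; only the gk20a_gmmu_map() call reflects
the interface introduced here:

	/* Hypothetical helper, not in the tree: map a mem_desc and forward
	 * its recorded backing aperture (sysmem or vidmem) so that the PTE
	 * update no longer hardcodes APERTURE_SYSMEM. */
	#include "gk20a/mm_gk20a.h"

	static u64 example_map_mem(struct vm_gk20a *vm, struct mem_desc *mem)
	{
		return gk20a_gmmu_map(vm, &mem->sgt, mem->size,
				      0, /* flags */
				      gk20a_mem_flag_none,
				      false, /* priv */
				      mem->aperture);
	}

Since a mem_desc records its aperture at allocation time, callers no
longer need to know whether a given buffer ended up in sysmem or vidmem
before mapping it.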