From 048c6b062ae381a329dccbc7ca0599113dbd7417 Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Thu, 11 May 2017 18:25:47 +0100 Subject: gpu: nvgpu: Separate GMMU mapping impl from mm_gk20a.c Separate the non-chip specific GMMU mapping implementation code out of mm_gk20a.c. This puts all of the chip-agnostic code into common/mm/gmmu.c in preparation for rewriting it. JIRA NVGPU-12 JIRA NVGPU-30 Change-Id: I6f7fdac3422703f5e80bb22ad304dc27bba4814d Signed-off-by: Alex Waterman Reviewed-on: http://git-master/r/1480228 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/mm/gmmu.c | 517 +++++++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/common/mm/vm.c | 21 +- 2 files changed, 536 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/nvgpu/common') diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index dc91cc2f..e63155f2 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -15,14 +15,81 @@ */ #include +#include #include #include #include #include +#include #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" +#define gmmu_dbg(g, fmt, args...) \ + nvgpu_log(g, gpu_dbg_map, fmt, ##args) +#define gmmu_dbg_v(g, fmt, args...) \ + nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) + +static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry) +{ + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.priv.sgt->sgl->length); + return 0; +} + +static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry) +{ + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.priv.sgt->sgl->length); +} + +static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) + return map_gmmu_phys_pages(entry); + + if (IS_ENABLED(CONFIG_ARM64)) { + if (entry->mem.aperture == APERTURE_VIDMEM) + return 0; + + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.size); + } else { + int err = nvgpu_mem_begin(g, &entry->mem); + + if (err) + return err; + } + + return 0; +} + +static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { + unmap_gmmu_phys_pages(entry); + return; + } + + if (IS_ENABLED(CONFIG_ARM64)) { + if (entry->mem.aperture == APERTURE_VIDMEM) + return; + + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.size); + } else { + nvgpu_mem_end(g, &entry->mem); + } +} + static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order, struct gk20a_mm_entry *entry) { @@ -97,6 +164,44 @@ static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, return 0; } +static void free_gmmu_phys_pages(struct vm_gk20a *vm, + struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + /* note: mem_desc slightly abused (wrt. 
nvgpu_free_gmmu_pages) */ + + free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size)); + entry->mem.cpu_va = NULL; + + sg_free_table(entry->mem.priv.sgt); + nvgpu_kfree(vm->mm->g, entry->mem.priv.sgt); + entry->mem.priv.sgt = NULL; + entry->mem.size = 0; + entry->mem.aperture = APERTURE_INVALID; +} + +void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, + struct gk20a_mm_entry *entry) +{ + struct gk20a *g = gk20a_from_vm(vm); + + gk20a_dbg_fn(""); + + if (!entry->mem.size) + return; + + if (entry->woffset) /* fake shadow mem */ + return; + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { + free_gmmu_phys_pages(vm, entry); + return; + } + + nvgpu_dma_free(g, &entry->mem); +} + /* * Allocate a phys contig region big enough for a full * sized gmmu page table for the given gmmu_page_size. @@ -202,6 +307,9 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, return vaddr; } +/* + * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. + */ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 size, @@ -246,3 +354,412 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) nvgpu_mutex_release(&vm->update_gmmu_lock); } + +static int update_gmmu_level_locked(struct vm_gk20a *vm, + struct gk20a_mm_entry *pte, + enum gmmu_pgsz_gk20a pgsz_idx, + struct scatterlist **sgl, + u64 *offset, + u64 *iova, + u64 gpu_va, u64 gpu_end, + u8 kind_v, u64 *ctag, + bool cacheable, bool unmapped_pte, + int rw_flag, + bool sparse, + int lvl, + bool priv, + enum nvgpu_aperture aperture) +{ + struct gk20a *g = gk20a_from_vm(vm); + const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; + const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; + int err = 0; + u32 pde_i; + u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; + struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; + + gk20a_dbg_fn(""); + + pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) + >> (u64)l->lo_bit[pgsz_idx]; + + gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", + pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); + + while (gpu_va < gpu_end) { + u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); + + /* Allocate next level */ + if (next_l->update_entry) { + if (!pte->entries) { + int num_entries = + 1 << + (l->hi_bit[pgsz_idx] + - l->lo_bit[pgsz_idx] + 1); + pte->entries = + nvgpu_vzalloc(g, + sizeof(struct gk20a_mm_entry) * + num_entries); + if (!pte->entries) + return -ENOMEM; + pte->pgsz = pgsz_idx; + pte->num_entries = num_entries; + } + prev_pte = next_pte; + next_pte = pte->entries + pde_i; + + if (!next_pte->mem.size) { + err = nvgpu_zalloc_gmmu_page_table(vm, + pgsz_idx, next_l, next_pte, prev_pte); + if (err) + return err; + } + } + + err = l->update_entry(vm, pte, pde_i, pgsz_idx, + sgl, offset, iova, + kind_v, ctag, cacheable, unmapped_pte, + rw_flag, sparse, priv, aperture); + if (err) + return err; + + if (next_l->update_entry) { + /* get cpu access to the ptes */ + err = map_gmmu_pages(g, next_pte); + if (err) { + nvgpu_err(g, + "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); + return err; + } + err = update_gmmu_level_locked(vm, next_pte, + pgsz_idx, + sgl, + offset, + iova, + gpu_va, + next, + kind_v, ctag, cacheable, unmapped_pte, + rw_flag, sparse, lvl+1, priv, aperture); + unmap_gmmu_pages(g, next_pte); + + if (err) + return err; + } + + pde_i++; + gpu_va = next; + } + + gk20a_dbg_fn("done"); + + return 0; +} + +/* + * This is the true top level GMMU mapping logic. 
This breaks down the incoming + * scatter gather table and does actual programming of GPU virtual address to + * physical* address. + * + * The update of each level of the page tables is farmed out to chip specific + * implementations. But the logic around that is generic to all chips. Every chip + * has some number of PDE levels and then a PTE level. + * + * Each chunk of the incoming SGT is sent to the chip specific implementation + * of page table update. + * + * [*] Note: the "physical" address may actually be an IO virtual address in the + * case of SMMU usage. + */ +static int update_gmmu_ptes_locked(struct vm_gk20a *vm, + enum gmmu_pgsz_gk20a pgsz_idx, + struct sg_table *sgt, + u64 buffer_offset, + u64 gpu_va, u64 gpu_end, + u8 kind_v, u32 ctag_offset, + bool cacheable, bool unmapped_pte, + int rw_flag, + bool sparse, + bool priv, + enum nvgpu_aperture aperture) +{ + struct gk20a *g = gk20a_from_vm(vm); + int ctag_granularity = g->ops.fb.compression_page_size(g); + u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; + u64 iova = 0; + u64 space_to_skip = buffer_offset; + u64 map_size = gpu_end - gpu_va; + u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; + int err; + struct scatterlist *sgl = NULL; + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *chunk = NULL; + u64 length; + + /* note: here we need to map kernel to small, since the + * low-level mmu code assumes 0 is small and 1 is big pages */ + if (pgsz_idx == gmmu_page_size_kernel) + pgsz_idx = gmmu_page_size_small; + + if (space_to_skip & (page_size - 1)) + return -EINVAL; + + err = map_gmmu_pages(g, &vm->pdb); + if (err) { + nvgpu_err(g, + "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); + return err; + } + + if (aperture == APERTURE_VIDMEM) { + gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", + pgsz_idx, gpu_va, gpu_end-1); + + if (sgt) { + alloc = get_vidmem_page_alloc(sgt->sgl); + + nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, + page_alloc_chunk, list_entry) { + if (space_to_skip && + space_to_skip > chunk->length) { + space_to_skip -= chunk->length; + } else { + iova = chunk->base + space_to_skip; + length = chunk->length - space_to_skip; + length = min(length, map_size); + space_to_skip = 0; + + err = update_gmmu_level_locked(vm, + &vm->pdb, pgsz_idx, + &sgl, + &space_to_skip, + &iova, + gpu_va, gpu_va + length, + kind_v, &ctag, + cacheable, unmapped_pte, + rw_flag, sparse, 0, priv, + aperture); + if (err) + break; + + /* need to set explicit zero here */ + space_to_skip = 0; + gpu_va += length; + map_size -= length; + + if (!map_size) + break; + } + } + } else { + err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, + &sgl, + &space_to_skip, + &iova, + gpu_va, gpu_end, + kind_v, &ctag, + cacheable, unmapped_pte, rw_flag, + sparse, 0, priv, + aperture); + } + } else { + gmmu_dbg_v(g, + "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " + "buffer offset: %-4lld, nents: %d", + page_size, + gpu_va, gpu_end - gpu_va, + sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, + buffer_offset, + sgt ? 
sgt->nents : 0); + + if (sgt) { + iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); + if (!vm->mm->bypass_smmu && iova) { + iova += space_to_skip; + } else { + sgl = sgt->sgl; + + gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", + (u64)sg_phys(sgl), + sgl->length); + + while (space_to_skip && sgl && + space_to_skip + page_size > sgl->length) { + space_to_skip -= sgl->length; + sgl = sg_next(sgl); + gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", + (u64)sg_phys(sgl), + sgl->length); + } + + iova = sg_phys(sgl) + space_to_skip; + } + } + + err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, + &sgl, + &space_to_skip, + &iova, + gpu_va, gpu_end, + kind_v, &ctag, + cacheable, unmapped_pte, rw_flag, + sparse, 0, priv, + aperture); + } + + unmap_gmmu_pages(g, &vm->pdb); + + mb(); + + gk20a_dbg_fn("done"); + + return err; +} + +/** + * gk20a_locked_gmmu_map - Map a buffer into the GMMU + * + * This is for non-vGPU chips. It's part of the HAL at the moment but really + * should not be. Chip specific stuff is handled at the PTE/PDE programming + * layer. The rest of the logic is essentially generic for all chips. + * + * To call this function you must have locked the VM lock: vm->update_gmmu_lock. + * However, note: this function is not called directly. It's used through the + * mm.gmmu_lock() HAL. So before calling the mm.gmmu_lock() HAL make sure you + * have the update_gmmu_lock aquired. + */ +u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, + u64 map_offset, + struct sg_table *sgt, + u64 buffer_offset, + u64 size, + int pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + int rw_flag, + bool clear_ctags, + bool sparse, + bool priv, + struct vm_gk20a_mapping_batch *batch, + enum nvgpu_aperture aperture) +{ + int err = 0; + bool allocated = false; + struct gk20a *g = gk20a_from_vm(vm); + int ctag_granularity = g->ops.fb.compression_page_size(g); + u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); + + /* Allocate (or validate when map_offset != 0) the virtual address. */ + if (!map_offset) { + map_offset = __nvgpu_vm_alloc_va(vm, size, + pgsz_idx); + if (!map_offset) { + nvgpu_err(g, "failed to allocate va space"); + err = -ENOMEM; + goto fail_alloc; + } + allocated = true; + } + + gmmu_dbg(g, + "gv: 0x%04x_%08x + 0x%-7llx " + "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " + "pgsz=%-3dKb as=%-2d ctags=%d start=%d " + "kind=0x%x flags=0x%x apt=%s", + u64_hi32(map_offset), u64_lo32(map_offset), size, + sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, + sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, + sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, + sgt ? 
u64_lo32((u64)sg_phys(sgt->sgl)) : 0, + vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), + ctag_lines, ctag_offset, + kind_v, flags, nvgpu_aperture_str(aperture)); + + err = update_gmmu_ptes_locked(vm, pgsz_idx, + sgt, + buffer_offset, + map_offset, map_offset + size, + kind_v, + ctag_offset, + flags & + NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + flags & + NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, + rw_flag, + sparse, + priv, + aperture); + if (err) { + nvgpu_err(g, "failed to update ptes on map"); + goto fail_validate; + } + + if (!batch) + g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + else + batch->need_tlb_invalidate = true; + + return map_offset; +fail_validate: + if (allocated) + __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); +fail_alloc: + nvgpu_err(g, "%s: failed with err=%d", __func__, err); + return 0; +} + +void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch) +{ + int err = 0; + struct gk20a *g = gk20a_from_vm(vm); + + if (va_allocated) { + err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); + if (err) { + nvgpu_err(g, "failed to free va"); + return; + } + } + + /* unmap here needs to know the page size we assigned at mapping */ + err = update_gmmu_ptes_locked(vm, + pgsz_idx, + NULL, /* n/a for unmap */ + 0, + vaddr, + vaddr + size, + 0, 0, false /* n/a for unmap */, + false, rw_flag, + sparse, 0, + APERTURE_INVALID); /* don't care for unmap */ + if (err) + nvgpu_err(g, "failed to update gmmu ptes on unmap"); + + /* flush l2 so any dirty lines are written out *now*. + * also as we could potentially be switching this buffer + * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at + * some point in the future we need to invalidate l2. e.g. switching + * from a render buffer unmap (here) to later using the same memory + * for gmmu ptes. note the positioning of this relative to any smmu + * unmapping (below). */ + + if (!batch) { + gk20a_mm_l2_flush(g, true); + g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + } else { + if (!batch->gpu_l2_flushed) { + gk20a_mm_l2_flush(g, true); + batch->gpu_l2_flushed = true; + } + batch->need_tlb_invalidate = true; + } +} diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index e24d40bf..5ba386c9 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,22 @@ int vm_aspace_id(struct vm_gk20a *vm) return vm->as_share ? 
vm->as_share->id : -1; } +static void nvgpu_vm_free_entries(struct vm_gk20a *vm, + struct gk20a_mm_entry *parent, + int level) +{ + int i; + + if (parent->entries) + for (i = 0; i < parent->num_entries; i++) + nvgpu_vm_free_entries(vm, &parent->entries[i], level+1); + + if (parent->mem.size) + nvgpu_free_gmmu_pages(vm, parent); + nvgpu_vfree(vm->mm->g, parent->entries); + parent->entries = NULL; +} + u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, enum gmmu_pgsz_gk20a pgsz_idx) @@ -421,7 +438,7 @@ clean_up_allocators: clean_up_page_tables: /* Cleans up nvgpu_vm_init_page_tables() */ nvgpu_vfree(g, vm->pdb.entries); - free_gmmu_pages(vm, &vm->pdb); + nvgpu_free_gmmu_pages(vm, &vm->pdb); clean_up_vgpu_vm: #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) @@ -537,7 +554,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm) if (nvgpu_alloc_initialized(&vm->user_lp)) nvgpu_alloc_destroy(&vm->user_lp); - gk20a_vm_free_entries(vm, &vm->pdb, 0); + nvgpu_vm_free_entries(vm, &vm->pdb, 0); #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) -- cgit v1.2.2
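
The per-level walk in update_gmmu_level_locked() above is easier to follow with the index math pulled out on its own. What follows is a minimal standalone sketch, not driver code: the lo_bit/hi_bit values describe a hypothetical two-level layout, and walk_level() only prints which entry covers which piece of the VA range, mirroring how pde_i, pde_size and next are derived before recursing into the next level. The num_entries value matches the sizing of the lazily allocated pte->entries array in the patch.

#include <stdio.h>
#include <stdint.h>

/*
 * Hypothetical two-level layout: the PDE level decodes VA bits [37..26],
 * the PTE level decodes VA bits [25..12] (4 KiB small pages). The numbers
 * are illustrative only and are not taken from any real chip.
 */
struct level {
	unsigned int hi_bit;
	unsigned int lo_bit;
};

static const struct level levels[] = {
	{ .hi_bit = 37, .lo_bit = 26 },	/* PDE level */
	{ .hi_bit = 25, .lo_bit = 12 },	/* PTE level */
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Mirror of the index/range math used when walking one level. */
static void walk_level(int lvl, uint64_t gpu_va, uint64_t gpu_end)
{
	const struct level *l = &levels[lvl];
	uint64_t pde_size = 1ULL << l->lo_bit;
	uint64_t pde_i = (gpu_va & ((1ULL << (l->hi_bit + 1)) - 1ULL))
			  >> l->lo_bit;
	unsigned int num_entries = 1U << (l->hi_bit - l->lo_bit + 1);

	printf("level %d: %u entries per table\n", lvl, num_entries);

	while (gpu_va < gpu_end) {
		/* End of the entry covering gpu_va, clamped to gpu_end. */
		uint64_t next = min_u64((gpu_va + pde_size) & ~(pde_size - 1),
					gpu_end);

		printf("level %d entry %llu covers [0x%llx, 0x%llx)\n",
		       lvl, (unsigned long long)pde_i,
		       (unsigned long long)gpu_va,
		       (unsigned long long)next);

		/* Descend until the PTE level is reached. */
		if (lvl + 1 < (int)(sizeof(levels) / sizeof(levels[0])))
			walk_level(lvl + 1, gpu_va, next);

		pde_i++;
		gpu_va = next;
	}
}

int main(void)
{
	/* Walk a 12 KiB mapping; one PDE and three PTEs get touched. */
	walk_level(0, 0x104000000ULL, 0x104003000ULL);
	return 0;
}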
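
The buffer_offset handling in the vidmem branch of update_gmmu_ptes_locked() follows a simple pattern: skip whole allocation chunks until the offset lands inside one, then map piece by piece until map_size is consumed. Below is a small userspace model of just that loop, with made-up chunk addresses and offsets; it is a simplification (it skips on a >= test and ignores page-size alignment), not a restatement of the driver's exact edge-case behaviour.

#include <stdio.h>
#include <stdint.h>

/*
 * A physically discontiguous allocation described as a list of chunks.
 * The addresses are invented; in the driver this information comes from
 * the vidmem allocator's alloc_chunks list.
 */
struct chunk {
	uint64_t base;
	uint64_t length;
};

static const struct chunk chunks[] = {
	{ 0x00200000, 0x10000 },	/*  64 KiB */
	{ 0x00400000, 0x40000 },	/* 256 KiB */
	{ 0x00900000, 0x20000 },	/* 128 KiB */
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	uint64_t space_to_skip = 0x48000;	/* offset into the allocation */
	uint64_t gpu_va = 0x104000000ULL;	/* start of the GPU VA window */
	uint64_t map_size = 0x18000;		/* bytes left to map */
	size_t i;

	for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]) && map_size; i++) {
		const struct chunk *c = &chunks[i];
		uint64_t iova, length;

		/* Chunks entirely covered by the offset are skipped. */
		if (space_to_skip && space_to_skip >= c->length) {
			space_to_skip -= c->length;
			continue;
		}

		/* Start partway into this chunk, then map what fits. */
		iova = c->base + space_to_skip;
		length = min_u64(c->length - space_to_skip, map_size);
		space_to_skip = 0;

		printf("map gpu_va 0x%llx -> pa 0x%llx, 0x%llx bytes\n",
		       (unsigned long long)gpu_va,
		       (unsigned long long)iova,
		       (unsigned long long)length);

		gpu_va += length;
		map_size -= length;
	}

	return 0;
}

Each printed line corresponds to one call into update_gmmu_level_locked() in the real code, i.e. one physically contiguous piece of the buffer being programmed into the page tables.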
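
The tail of gk20a_locked_gmmu_unmap() is where vm_gk20a_mapping_batch pays off: a lone unmap flushes L2 and invalidates the TLB immediately, while batched unmaps flush L2 once and defer the TLB invalidate to the end of the batch. The sketch below models only that bookkeeping; l2_flush(), tlb_invalidate() and batch_finish() are stand-ins for the driver's real hooks, not their actual names or signatures.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for gk20a_mm_l2_flush() and g->ops.fb.tlb_invalidate(). */
static void l2_flush(void)		{ printf("L2 flush\n"); }
static void tlb_invalidate(void)	{ printf("TLB invalidate\n"); }

/* Mirrors the two fields used from vm_gk20a_mapping_batch. */
struct mapping_batch {
	bool gpu_l2_flushed;
	bool need_tlb_invalidate;
};

/* Tail of an unmap: either pay the full cost now or defer it. */
static void unmap_finish(struct mapping_batch *batch)
{
	if (!batch) {
		l2_flush();
		tlb_invalidate();
		return;
	}

	if (!batch->gpu_l2_flushed) {
		l2_flush();
		batch->gpu_l2_flushed = true;
	}
	batch->need_tlb_invalidate = true;
}

/* What the caller does once the whole batch has been processed. */
static void batch_finish(struct mapping_batch *batch)
{
	if (batch->need_tlb_invalidate)
		tlb_invalidate();
}

int main(void)
{
	struct mapping_batch batch = { 0 };
	int i;

	/* Three unmaps in one batch: one L2 flush, one TLB invalidate. */
	for (i = 0; i < 3; i++)
		unmap_finish(&batch);
	batch_finish(&batch);

	/* A lone unmap pays both costs immediately. */
	unmap_finish(NULL);

	return 0;
}

The point of the design is that the expensive global operations scale with the number of batches rather than with the number of mappings touched.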
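
Finally, nvgpu_vm_free_entries() in the vm.c hunk tears the page-table tree down children-first, frees a node's backing memory only if it was ever allocated, and then releases the entries array itself. The toy below models the recursion shape only; the real entries hold nvgpu_mem descriptors released through nvgpu_free_gmmu_pages(), not heap buffers.

#include <stdio.h>
#include <stdlib.h>

/* A toy page-table node: backing "memory" plus optional child tables. */
struct pt_entry {
	size_t mem_size;		/* 0 means nothing was ever allocated */
	struct pt_entry *entries;	/* child tables, NULL at the leaves */
	int num_entries;
};

static void free_entries(struct pt_entry *parent, int level)
{
	int i;

	/* Children first, so nothing is left pointing at freed memory. */
	if (parent->entries) {
		for (i = 0; i < parent->num_entries; i++)
			free_entries(&parent->entries[i], level + 1);
	}

	if (parent->mem_size) {
		printf("level %d: free %zu bytes of table memory\n",
		       level, parent->mem_size);
		parent->mem_size = 0;
	}

	free(parent->entries);
	parent->entries = NULL;
}

int main(void)
{
	struct pt_entry pdb = { .mem_size = 4096 };

	/* Pretend two of four PDEs ever had a PTE table allocated. */
	pdb.num_entries = 4;
	pdb.entries = calloc(pdb.num_entries, sizeof(*pdb.entries));
	if (!pdb.entries)
		return 1;
	pdb.entries[0].mem_size = 4096;
	pdb.entries[3].mem_size = 4096;

	free_entries(&pdb, 0);

	return 0;
}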