From 583704620db88e391f6b14acc57af859a70127de Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Fri, 9 Jun 2017 11:42:50 -0700
Subject: gpu: nvgpu: Implement PD packing

In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256
bytes; thus 16 PDs can fit in a single page. To allocate an entire page
for each of these 256 B PDs is extremely wasteful.

This patch aims to alleviate the wasted DMA memory from having small PDs
in a full page by packing multiple small PDs into a single page. The
packing is implemented as a slab allocator - each page is a slab and
from each page multiple PD instances can be allocated.

Several modifications to the nvgpu_gmmu_pd struct also needed to be made
to support this. The nvgpu_mem is now a pointer and there's an explicit
offset into the nvgpu_mem struct so that each nvgpu_gmmu_pd knows what
portion of the memory it's using. The nvgpu_pde_phys_addr() function and
the pd_write() functions also require some changes since the PD no
longer is always situated at the start of the nvgpu_mem.

Initialization and cleanup of the page tables for each VM was slightly
modified to work through the new pd_cache implementation.

Some PDs (i.e. the PDB), despite not being a full page, still require a
full page for alignment purposes (HW requirements). Thus a direct
allocation method for PDs is still provided. This is also used when a PD
that could in principle be cached is greater than a page in size.

Lastly a new debug flag was added for the pd_cache code.

JIRA NVGPU-30

Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/gk20a/gk20a.c    | 9 +++++++++
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 9 +++++----
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 4 ++++
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a')

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 380c28ac..a0753770 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 #include
@@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g)
 		g->gpu_reset_done = true;
 	}
 
+	/*
+	 * Do this early so any early VMs that get made are capable of mapping
+	 * buffers.
+	 */
+	err = nvgpu_pd_cache_init(g);
+	if (err)
+		return err;
+
 	/* init interface layer support for PMU falcon */
 	nvgpu_flcn_sw_init(g, FALCON_ID_PMU);
 	nvgpu_flcn_sw_init(g, FALCON_ID_SEC2);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 558a1b06..0a84cabb 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 
 	gk20a_semaphore_sea_destroy(g);
 	gk20a_vidmem_destroy(g);
+	nvgpu_pd_cache_fini(g);
 }
 
 static int gk20a_alloc_sysmem_flush(struct gk20a *g)
@@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g,
 				      struct nvgpu_gmmu_pd *pd, u64 addr)
 {
 	u32 pde0_bits =
-		nvgpu_aperture_mask(g, &pd->mem,
+		nvgpu_aperture_mask(g, pd->mem,
 			gmmu_pde_aperture_big_sys_mem_ncoh_f(),
 			gmmu_pde_aperture_big_video_memory_f()) |
 		gmmu_pde_address_big_sys_f(
@@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g,
 					struct nvgpu_gmmu_pd *pd, u64 addr)
 {
 	u32 pde1_bits =
-		nvgpu_aperture_mask(g, &pd->mem,
+		nvgpu_aperture_mask(g, pd->mem,
 			gmmu_pde_aperture_small_sys_mem_ncoh_f(),
 			gmmu_pde_aperture_small_video_memory_f()) |
 		gmmu_pde_vol_small_true_f() | /* tbd: why? */
@@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm)
 void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
+	u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 
 	gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
 
 	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
-		nvgpu_aperture_mask(g, &vm->pdb.mem,
+		nvgpu_aperture_mask(g, vm->pdb.mem,
 			ram_in_page_dir_base_target_sys_mem_ncoh_f(),
 			ram_in_page_dir_base_target_vid_mem_f()) |
 		ram_in_page_dir_base_vol_true_f() |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index a245d0e0..cadcffa4 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -31,6 +31,8 @@
 #include
 #include
 
+struct nvgpu_pd_cache;
+
 #ifdef CONFIG_ARM64
 #define outer_flush_range(a, b)
 #define __cpuc_flush_dcache_area __flush_dcache_area
@@ -217,6 +219,8 @@ struct mm_gk20a {
 		struct vm_gk20a *vm;
 	} ce;
 
+	struct nvgpu_pd_cache *pd_cache;
+
 	struct nvgpu_mutex l2_op_lock;
 	struct nvgpu_mutex tlb_lock;
 	struct nvgpu_mutex priv_lock;
-- 
cgit v1.2.2