From 583704620db88e391f6b14acc57af859a70127de Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Fri, 9 Jun 2017 11:42:50 -0700 Subject: gpu: nvgpu: Implement PD packing In some cases page directories require less than a full page of memory. For example, on Pascal, the final PD level for large pages is only 256 bytes; thus 16 PDs can fit in a single page. To allocate an entire page for each of these 256 B PDs is extremely wasteful. This patch aims to alleviate the wasted DMA memory from having small PDs in a full page by packing multiple small PDs into a single page. The packing is implemented as a slab allocator - each page is a slab and from each page multiple PD instances can be allocated. Several modifications to the nvgpu_gmmu_pd struct also needed to be made to support this. The nvgpu_mem is now a pointer and there's an explicit offset into the nvgpu_mem struct so that each nvgpu_gmmu_pd knows what portion of the memory it's using. The nvgpu_pde_phys_addr() function and the pd_write() functions also require some changes since the PD no longer is always situated at the start of the nvgpu_mem. Initialization and cleanup of the page tables for each VM was slightly modified to work through the new pd_cache implementation. Some PDs (i.e the PDB), despite not being a full page, still require a full page for alignment purposes (HW requirements). Thus a direct allocation method for PDs is still provided. This is also used when a PD that could in principle be cached is greater than a page in size. Lastly a new debug flag was added for the pd_cache code. JIRA NVGPU-30 Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c Signed-off-by: Alex Waterman Reviewed-on: https://git-master/r/1506610 Reviewed-by: Terje Bergstrom GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/Makefile.nvgpu | 1 + drivers/gpu/nvgpu/common/mm/gmmu.c | 81 +++---- drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 +++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/common/mm/vm.c | 50 ++-- drivers/gpu/nvgpu/gk20a/gk20a.c | 9 + drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 9 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 4 + drivers/gpu/nvgpu/gp10b/mm_gp10b.c | 10 +- drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 91 ++++++- drivers/gpu/nvgpu/include/nvgpu/log.h | 1 + 10 files changed, 609 insertions(+), 73 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/mm/pd_cache.c (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index 3a256771..4aaf7bc5 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu @@ -50,6 +50,7 @@ nvgpu-y := \ common/mm/page_allocator.o \ common/mm/lockless_allocator.o \ common/mm/gmmu.o \ + common/mm/pd_cache.o \ common/mm/vm.o \ common/mm/vm_area.o \ common/bus.o \ diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index ec1bc095..602dfb3b 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, const struct gk20a_mmu_level *l, struct nvgpu_gmmu_attrs *attrs); - +static u32 pd_size(const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs); /* * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU * VA will be allocated for you. 
If addr is non-zero then the buffer will be @@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) { + u32 pdb_size; + int err; + /* * Need this just for page size. Everything else can be ignored. Also * note that we can just use pgsz 0 (i.e small pages) since the number @@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) .pgsz = 0, }; - return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); -} + /* + * PDB size here must be one page so that its address is page size + * aligned. Although lower PDE tables can be aligned at 256B boundaries + * the main PDB must be page aligned. + */ + pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE); + + err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size); + if (WARN_ON(err)) + return err; + /* + * One mb() is done after all mapping operations. Don't need individual + * barriers for each PD write. + */ + vm->pdb.mem->skip_wmb = true; + + return 0; +} /* * Ensure that there's a CPU mapping for the page directory memory. This won't * always be the case for 32 bit systems since we may need to save kernel * virtual memory. */ -static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) +static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { - return nvgpu_mem_begin(g, &entry->mem); + return nvgpu_mem_begin(g, pd->mem); } /* * Handle any necessary CPU unmap semantics for a page directories DMA memory. * For 64 bit platforms this is a noop. */ -static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) -{ - nvgpu_mem_end(g, &entry->mem); -} - -static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, - struct nvgpu_gmmu_pd *pd) -{ - struct gk20a *g = gk20a_from_vm(vm); - unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; - int err; - - /* - * On arm32 vmalloc space is a precious commodity so we do not map pages - * by default. - */ - if (!IS_ENABLED(CONFIG_ARM64)) - flags |= NVGPU_DMA_NO_KERNEL_MAPPING; - - err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); - if (err) - return -ENOMEM; - - return 0; -} - -void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, - struct nvgpu_gmmu_pd *pd) +static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { - struct gk20a *g = gk20a_from_vm(vm); - - nvgpu_dma_free(g, &pd->mem); + nvgpu_mem_end(g, pd->mem); } /* @@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, */ u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { + u64 page_addr; + if (g->mm.has_physical_mode) - return sg_phys(pd->mem.priv.sgt->sgl); + page_addr = sg_phys(pd->mem->priv.sgt->sgl); else - return nvgpu_mem_get_base_addr(g, &pd->mem, 0); + page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0); + + return page_addr + pd->mem_offs; } /* @@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm, { int err; - if (pd->mem.size) + if (pd->mem) return 0; - err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); + err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs)); if (err) { nvgpu_info(vm->mm->g, "error allocating page directory!"); return err; @@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm, * One mb() is done after all mapping operations. Don't need individual * barriers for each PD write. 
*/
- pd->mem.skip_wmb = true;
+ pd->mem->skip_wmb = true;
return 0;
}
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
}
if (!batch)
- g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+ g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
else
batch->need_tlb_invalidate = true;
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
if (!batch) {
gk20a_mm_l2_flush(g, true);
- g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+ g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
} else {
if (!batch->gpu_l2_flushed) {
gk20a_mm_l2_flush(g, true);
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
new file mode 100644
index 00000000..4f312eff
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gk20a/gk20a.h"
+#include "gk20a/mm_gk20a.h"
+
+#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
+
+/**
+ * DOC: PD cache
+ *
+ * In the name of saving memory with the many sub-page-sized PD levels in
+ * Pascal and beyond, a way of packing PD tables together is necessary. This
+ * code does just that. If a PD table only requires 1024 bytes, then it is
+ * possible to have 4 of these PDs in one page. This is even more pronounced
+ * for 256 byte PD tables.
+ *
+ * The pd cache is basically just a slab allocator. Each instance of the nvgpu
+ * driver makes one of these structs:
+ *
+ * struct nvgpu_pd_cache {
+ * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+ * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+ *
+ * struct nvgpu_rbtree_node *mem_tree;
+ * };
+ *
+ * There are two sets of lists, the full and the partial. The full lists contain
+ * pages of memory for which all the memory in that page is in use. The partial
+ * lists contain partially full pages of memory which can be used for more PD
+ * allocations. There are a couple of assumptions here:
+ *
+ * 1. PDs greater than or equal to the page size bypass the pd cache.
+ * 2. PDs are always a power of 2 in size and at least %NVGPU_PD_CACHE_MIN bytes.
+ *
+ * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
+ * lists. For a 4KB page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
+ * 256, 512, 1024, and 2048 byte PDs.
+ *
+ * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
+ * size is page size or larger and choose the correct allocation scheme - either
+ * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD
+ * allocated by __nvgpu_pd_alloc().
+ *
+ * Since the top level PD (the PDB) is a page-aligned pointer but less than a
+ * page in size, the direct functions must be used for allocating PDBs.
+ * Otherwise there would be alignment issues for the PDBs when they get packed.
+ */ + +static u32 nvgpu_pd_cache_nr(u32 bytes) +{ + return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); +} + +static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) +{ + u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); + + return mask_offset - 1; +} + +int nvgpu_pd_cache_init(struct gk20a *g) +{ + struct nvgpu_pd_cache *cache; + int i; + + /* + * This gets called from finalize_poweron() so we need to make sure we + * don't reinit the pd_cache over and over. + */ + if (g->mm.pd_cache) + return 0; + + cache = nvgpu_kzalloc(g, sizeof(*cache)); + if (!cache) { + nvgpu_err(g, "Failed to alloc pd_cache!"); + return -ENOMEM; + } + + for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { + nvgpu_init_list_node(&cache->full[i]); + nvgpu_init_list_node(&cache->partial[i]); + } + + cache->mem_tree = NULL; + g->mm.pd_cache = cache; + nvgpu_mutex_init(&cache->lock); + + pd_dbg(g, "PD cache initialized!"); + + return 0; +} + +void nvgpu_pd_cache_fini(struct gk20a *g) +{ + int i; + struct nvgpu_pd_cache *cache = g->mm.pd_cache; + + if (!cache) + return; + + for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { + WARN_ON(!nvgpu_list_empty(&cache->full[i])); + WARN_ON(!nvgpu_list_empty(&cache->partial[i])); + } + + nvgpu_kfree(g, g->mm.pd_cache); +} + +/* + * This is the simple pass-through for greater than page or page sized PDs. + * + * Note: this does not need the cache lock since it does not modify any of the + * PD cache data structures. + */ +int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, + struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + int err; + + pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); + + pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); + if (!pd->mem) { + pd_dbg(g, "OOM allocating nvgpu_mem struct!"); + return -ENOMEM; + } + + err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, + bytes, pd->mem); + if (err) { + pd_dbg(g, "OOM allocating page directory!"); + nvgpu_kfree(g, pd->mem); + return -ENOMEM; + } + + pd->cached = false; + pd->mem_offs = 0; + + return 0; +} + +/* + * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed + * pd to reflect this allocation. + */ +static int nvgpu_pd_cache_alloc_new(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd, + u32 bytes) +{ + struct nvgpu_pd_mem_entry *pentry; + + pd_dbg(g, "PD-Alloc [C] New: offs=0"); + + pentry = nvgpu_kzalloc(g, sizeof(*pentry)); + if (!pentry) { + pd_dbg(g, "OOM allocating pentry!"); + return -ENOMEM; + } + + if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, + PAGE_SIZE, &pentry->mem)) { + nvgpu_kfree(g, pentry); + pd_dbg(g, "Unable to DMA alloc!"); + return -ENOMEM; + } + + pentry->pd_size = bytes; + nvgpu_list_add(&pentry->list_entry, + &cache->partial[nvgpu_pd_cache_nr(bytes)]); + + /* + * This allocates the very first PD table in the set of tables in this + * nvgpu_pd_mem_entry. + */ + pentry->alloc_map = 1; + + /* + * Now update the nvgpu_gmmu_pd to reflect this allocation. + */ + pd->mem = &pentry->mem; + pd->mem_offs = 0; + pd->cached = true; + + pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; + nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); + + return 0; +} + +static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry, + struct nvgpu_gmmu_pd *pd) +{ + unsigned long bit_offs; + u32 mem_offs; + u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); + + /* + * Find and allocate an open PD. 
+ */ + bit_offs = ffz(pentry->alloc_map); + mem_offs = bit_offs * pentry->pd_size; + + /* Bit map full. Somethings wrong. */ + if (WARN_ON(bit_offs >= ffz(pentry_mask))) + return -ENOMEM; + + pentry->alloc_map |= 1 << bit_offs; + + pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); + + /* + * First update the pd. + */ + pd->mem = &pentry->mem; + pd->mem_offs = mem_offs; + pd->cached = true; + + /* + * Now make sure the pentry is in the correct list (full vs partial). + */ + if ((pentry->alloc_map & pentry_mask) == pentry_mask) { + pd_dbg(g, "Adding pentry to full list!"); + nvgpu_list_del(&pentry->list_entry); + nvgpu_list_add(&pentry->list_entry, + &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); + } + + return 0; +} + +/* + * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial + * nvgpu_pd_mem_entry's. + */ +static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( + struct nvgpu_pd_cache *cache, u32 bytes) +{ + struct nvgpu_list_node *list = + &cache->partial[nvgpu_pd_cache_nr(bytes)]; + + if (nvgpu_list_empty(list)) + return NULL; + + return nvgpu_list_first_entry(list, + nvgpu_pd_mem_entry, + list_entry); +} + +/* + * Allocate memory from an nvgpu_mem for the page directory. + */ +static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + struct nvgpu_pd_mem_entry *pentry; + int err; + + pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); + + if (bytes & (bytes - 1) || + (bytes >= PAGE_SIZE || + bytes < NVGPU_PD_CACHE_MIN)) { + pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); + return -EINVAL; + } + + pentry = nvgpu_pd_cache_get_partial(cache, bytes); + if (!pentry) + err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); + else + err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); + + if (err) + pd_dbg(g, "PD-Alloc [C] Failed!"); + + return err; +} + +/* + * Allocate the DMA memory for a page directory. This handles the necessary PD + * cache logistics. Since on Parker and later GPUs some of the page directories + * are smaller than a page packing these PDs together saves a lot of memory. + */ +int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + struct gk20a *g = gk20a_from_vm(vm); + int err; + + /* + * Simple case: PD is bigger than a page so just do a regular DMA + * alloc. + */ + if (bytes >= PAGE_SIZE) { + err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); + if (err) + return err; + + return 0; + } + + if (WARN_ON(!g->mm.pd_cache)) + return -ENOMEM; + + nvgpu_mutex_acquire(&g->mm.pd_cache->lock); + err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); + nvgpu_mutex_release(&g->mm.pd_cache->lock); + + return err; +} + +void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) +{ + pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); + + if (!pd->mem) + return; + + nvgpu_dma_free(g, pd->mem); + nvgpu_kfree(g, pd->mem); + pd->mem = NULL; +} + +static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry) +{ + nvgpu_dma_free(g, &pentry->mem); + nvgpu_list_del(&pentry->list_entry); + nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); + nvgpu_kfree(g, pentry); +} + +static void nvgpu_pd_cache_do_free(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry, + struct nvgpu_gmmu_pd *pd) +{ + u32 index = pd->mem_offs / pentry->pd_size; + u32 bit = 1 << index; + + /* Mark entry as free. 
*/ + pentry->alloc_map &= ~bit; + + if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { + /* + * Partially full still. If it was already on the partial list + * this just re-adds it. + */ + nvgpu_list_del(&pentry->list_entry); + nvgpu_list_add(&pentry->list_entry, + &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); + } else { + /* Empty now so free it. */ + nvgpu_pd_cache_free_mem_entry(g, cache, pentry); + } +} + +static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( + struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd) +{ + struct nvgpu_rbtree_node *node; + + nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, + cache->mem_tree); + if (!node) + return NULL; + + return nvgpu_pd_mem_entry_from_tree_entry(node); +} + +static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd) +{ + struct nvgpu_pd_mem_entry *pentry; + + pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); + + pentry = nvgpu_pd_cache_look_up(g, cache, pd); + if (!pentry) { + WARN(1, "Attempting to free non-existent pd"); + return; + } + + nvgpu_pd_cache_do_free(g, cache, pentry, pd); +} + +void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) +{ + struct gk20a *g = gk20a_from_vm(vm); + + /* + * Simple case: just DMA free. + */ + if (!pd->cached) + return __nvgpu_pd_cache_free_direct(g, pd); + + nvgpu_mutex_acquire(&g->mm.pd_cache->lock); + nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); + nvgpu_mutex_release(&g->mm.pd_cache->lock); +} diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index 3aeba500..3ed3c7fe 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c @@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm) return vm->as_share ? vm->as_share->id : -1; } -static void nvgpu_vm_free_entries(struct vm_gk20a *vm, - struct nvgpu_gmmu_pd *parent, - int level) +static void __nvgpu_vm_free_entries(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd, + int level) { int i; - if (parent->entries) - for (i = 0; i < parent->num_entries; i++) - nvgpu_vm_free_entries(vm, &parent->entries[i], + if (pd->mem) { + __nvgpu_pd_free(vm, pd); + pd->mem = NULL; + } + + if (pd->entries) { + for (i = 0; i < pd->num_entries; i++) + __nvgpu_vm_free_entries(vm, &pd->entries[i], level + 1); + nvgpu_vfree(vm->mm->g, pd->entries); + pd->entries = NULL; + } +} + +static void nvgpu_vm_free_entries(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pdb) +{ + struct gk20a *g = vm->mm->g; + int i; + + __nvgpu_pd_cache_free_direct(g, pdb); + + if (!pdb->entries) + return; + + for (i = 0; i < pdb->num_entries; i++) + __nvgpu_vm_free_entries(vm, &pdb->entries[i], 1); - if (parent->mem.size) - nvgpu_free_gmmu_pages(vm, parent); - nvgpu_vfree(vm->mm->g, parent->entries); - parent->entries = NULL; + nvgpu_vfree(g, pdb->entries); + pdb->entries = NULL; } u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, @@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked( if (mapping_batch->need_tlb_invalidate) { struct gk20a *g = gk20a_from_vm(vm); - g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + g->ops.fb.tlb_invalidate(g, vm->pdb.mem); } } @@ -407,9 +428,8 @@ clean_up_allocators: if (nvgpu_alloc_initialized(&vm->user_lp)) nvgpu_alloc_destroy(&vm->user_lp); clean_up_page_tables: - /* Cleans up nvgpu_vm_init_page_tables() */ - nvgpu_vfree(g, vm->pdb.entries); - nvgpu_free_gmmu_pages(vm, &vm->pdb); + /* Cleans up nvgpu_gmmu_init_page_table() */ + __nvgpu_pd_cache_free_direct(g, &vm->pdb); clean_up_vgpu_vm: #ifdef 
CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) @@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm) if (nvgpu_alloc_initialized(&vm->user_lp)) nvgpu_alloc_destroy(&vm->user_lp); - nvgpu_vm_free_entries(vm, &vm->pdb, 0); + nvgpu_vm_free_entries(vm, &vm->pdb); #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 380c28ac..a0753770 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g) g->gpu_reset_done = true; } + /* + * Do this early so any early VMs that get made are capable of mapping + * buffers. + */ + err = nvgpu_pd_cache_init(g); + if (err) + return err; + /* init interface layer support for PMU falcon */ nvgpu_flcn_sw_init(g, FALCON_ID_PMU); nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 558a1b06..0a84cabb 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm) gk20a_semaphore_sea_destroy(g); gk20a_vidmem_destroy(g); + nvgpu_pd_cache_fini(g); } static int gk20a_alloc_sysmem_flush(struct gk20a *g) @@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g, struct nvgpu_gmmu_pd *pd, u64 addr) { u32 pde0_bits = - nvgpu_aperture_mask(g, &pd->mem, + nvgpu_aperture_mask(g, pd->mem, gmmu_pde_aperture_big_sys_mem_ncoh_f(), gmmu_pde_aperture_big_video_memory_f()) | gmmu_pde_address_big_sys_f( @@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g, struct nvgpu_gmmu_pd *pd, u64 addr) { u32 pde1_bits = - nvgpu_aperture_mask(g, &pd->mem, + nvgpu_aperture_mask(g, pd->mem, gmmu_pde_aperture_small_sys_mem_ncoh_f(), gmmu_pde_aperture_small_video_memory_f()) | gmmu_pde_vol_small_true_f() | /* tbd: why? 
*/ @@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); gk20a_dbg_info("pde pa=0x%llx", pdb_addr); nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), - nvgpu_aperture_mask(g, &vm->pdb.mem, + nvgpu_aperture_mask(g, vm->pdb.mem, ram_in_page_dir_base_target_sys_mem_ncoh_f(), ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index a245d0e0..cadcffa4 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -31,6 +31,8 @@ #include #include +struct nvgpu_pd_cache; + #ifdef CONFIG_ARM64 #define outer_flush_range(a, b) #define __cpuc_flush_dcache_area __flush_dcache_area @@ -217,6 +219,8 @@ struct mm_gk20a { struct vm_gk20a *vm; } ce; + struct nvgpu_pd_cache *pd_cache; + struct nvgpu_mutex l2_op_lock; struct nvgpu_mutex tlb_lock; struct nvgpu_mutex priv_lock; diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index c3867e9d..2ff199c6 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c @@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm, phys_addr >>= gmmu_new_pde_address_shift_v(); - pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_pde_aperture_sys_mem_ncoh_f(), gmmu_new_pde_aperture_video_memory_f()); pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); @@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, if (small_valid) { pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(small_addr); - pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[2] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_small_video_memory_f()); pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); @@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, if (big_valid) { pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); - pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_big_video_memory_f()); pde_v[1] |= big_addr >> 28; @@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); gk20a_dbg_info("pde pa=0x%llx", pdb_addr); nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), - nvgpu_aperture_mask(g, &vm->pdb.mem, + nvgpu_aperture_mask(g, vm->pdb.mem, ram_in_page_dir_base_target_sys_mem_ncoh_f(), ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index 28a2cb82..eff87c31 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ 
b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -19,6 +19,9 @@
#include
#include
+#include
+#include
+#include
struct scatterlist;
@@ -44,15 +47,86 @@ enum gk20a_mem_rw_flag {
gk20a_mem_flag_write_only = 2, /* WO */
};
+/*
+ * Minimum size of a cached PD. The number of different caches in the
+ * nvgpu_pd_cache structure of course depends on this. The MIN_SHIFT define is
+ * the right number of bits to shift to determine which list to use in the
+ * array of lists.
+ */
+#define NVGPU_PD_CACHE_MIN 256
+#define NVGPU_PD_CACHE_MIN_SHIFT 9
+#define NVGPU_PD_CACHE_COUNT 4
+
+struct nvgpu_pd_mem_entry {
+ struct nvgpu_mem mem;
+
+ /*
+ * Size of the page directories (not the mem). alloc_map is a bitmap
+ * showing which PDs have been allocated. The size of mem will always
+ * be one page. pd_size will always be a power of 2.
+ */
+ u32 pd_size;
+ unsigned long alloc_map;
+
+ struct nvgpu_list_node list_entry;
+ struct nvgpu_rbtree_node tree_entry;
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
+{
+ return (struct nvgpu_pd_mem_entry *)
+ ((uintptr_t)node -
+ offsetof(struct nvgpu_pd_mem_entry, list_entry));
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
+{
+ return (struct nvgpu_pd_mem_entry *)
+ ((uintptr_t)node -
+ offsetof(struct nvgpu_pd_mem_entry, tree_entry));
+};
+
+/*
+ * A cache for allocating PD memory from. This enables smaller PDs to be packed
+ * into single pages.
+ *
+ * This is fairly complex so see the documentation in pd_cache.c for a full
+ * description of how this is organized.
+ */
+struct nvgpu_pd_cache {
+ /*
+ * Array of lists of full nvgpu_pd_mem_entries and partially full (or
+ * empty) nvgpu_pd_mem_entries.
+ */
+ struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+ struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+
+ /*
+ * Tree of all allocated struct nvgpu_mem's for fast look up.
+ */
+ struct nvgpu_rbtree_node *mem_tree;
+
+ /*
+ * All access to the cache must be locked. This protects the lists and
+ * the rb tree.
+ */
+ struct nvgpu_mutex lock;
+};
+
/*
* GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
* in the GMMU.
*/
struct nvgpu_gmmu_pd {
/*
- * DMA memory describing the PTEs or PTEs.
+ * DMA memory describing the PTEs or PDEs. @mem_offs describes the
+ * offset of the PDE table in @mem. @cached specifies if this PD is
+ * using pd_cache memory.
*/
- struct nvgpu_mem mem;
+ struct nvgpu_mem *mem;
+ u32 mem_offs;
+ bool cached;

/*
* List of pointers to the next level of page tables. Does not
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd {
* Reduce the number of arguments getting passed through the various levels of
* GMMU mapping functions.
*
- * The following fields are set statically and do not change throughout
+ * The following fields are set statically and do not change throughout the
* mapping call:
*
* pgsz: Index into the page size table.
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
struct nvgpu_mem *mem,
u64 gpu_va);
-void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
- struct nvgpu_gmmu_pd *entry);
+int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
+int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+ struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
+int nvgpu_pd_cache_init(struct gk20a *g);
+void nvgpu_pd_cache_fini(struct gk20a *g);
/*
* Some useful routines that are shared across chips.
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
size_t w, size_t data)
{
- nvgpu_mem_wr32(g, &pd->mem, w, data);
+ nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data);
}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 3b8e6b19..a1110a59 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -68,6 +68,7 @@ enum nvgpu_log_categories {
gpu_dbg_xv = BIT(17), /* XVE debugging. */
gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */
gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */
+ gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */
gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */
};
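
To make the slab bookkeeping in the pd_cache DOC comment concrete, the following standalone user-space sketch mirrors the arithmetic behind nvgpu_pd_cache_nr(), nvgpu_pd_cache_get_mask() and the ffz()-based slot search. It assumes a 4KB page and substitutes compiler builtins for the kernel's ilog2()/ffz(); the helper names (pd_cache_nr, first_zero_bit) and the sample bitmap value are illustrative only and are not part of the patch.

#include <stdio.h>
#include <stdint.h>

#define PD_CACHE_MIN        256U   /* smallest PD size the cache handles  */
#define PD_CACHE_MIN_SHIFT  9U     /* as in NVGPU_PD_CACHE_MIN_SHIFT      */
#define PD_CACHE_COUNT      4U     /* 256, 512, 1024 and 2048 byte PDs    */
#define PAGE_SZ             4096U  /* one slab is one (assumed 4KB) page  */

/* List index for a PD size: 256 -> 0, 512 -> 1, 1024 -> 2, 2048 -> 3. */
static uint32_t pd_cache_nr(uint32_t bytes)
{
	/* User-space stand-in for the kernel's ilog2(). */
	return 31U - (uint32_t)__builtin_clz(bytes >> (PD_CACHE_MIN_SHIFT - 1U));
}

/* Bitmap mask covering every PD slot in one page-sized slab. */
static uint32_t pd_cache_mask(uint32_t pd_size)
{
	return (1U << (PAGE_SZ / pd_size)) - 1U;
}

/* First clear bit of the allocation map: a stand-in for the kernel's ffz(). */
static uint32_t first_zero_bit(uint32_t map)
{
	return (uint32_t)__builtin_ctz(~map);
}

int main(void)
{
	uint32_t sizes[PD_CACHE_COUNT] = { 256U, 512U, 1024U, 2048U };
	uint32_t map = 0x0000ffabU; /* example occupancy of a slab of 256B PDs */
	uint32_t i, slot;

	for (i = 0U; i < PD_CACHE_COUNT; i++)
		printf("pd_size=%4u -> list %u, %2u slots/page, mask=0x%04x\n",
		       (unsigned)sizes[i], (unsigned)pd_cache_nr(sizes[i]),
		       (unsigned)(PAGE_SZ / sizes[i]),
		       (unsigned)pd_cache_mask(sizes[i]));

	/* Allocate from a partial slab: take the first free slot. */
	slot = first_zero_bit(map);
	printf("map=0x%04x: next 256B PD goes in slot %u, mem_offs=%u\n",
	       (unsigned)map, (unsigned)slot, (unsigned)(slot * PD_CACHE_MIN));

	map |= 1U << slot;
	if ((map & pd_cache_mask(PD_CACHE_MIN)) == pd_cache_mask(PD_CACHE_MIN))
		printf("slab is now full -> move it to the full list\n");

	return 0;
}

Running this shows why NVGPU_PD_CACHE_COUNT is 4 for a 4KB page: 256, 512, 1024 and 2048 byte PDs each get their own full/partial list, and a slab of 256B PDs holds 16 entries tracked by a 16-bit occupancy mask.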
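
The other consequence of packing is that a PD no longer sits at offset zero of its backing nvgpu_mem: nvgpu_pde_phys_addr() adds pd->mem_offs to the page's base address, and pd_write() converts that byte offset into a 32-bit word index. The short sketch below walks through both computations for a hypothetical 256-byte PD occupying slot 2 of its page; the addresses and values are made up purely for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical values: a 256B PD in slot 2 of its 4KB slab. */
	uint64_t page_addr = 0x0000000081234000ULL; /* base of the slab's nvgpu_mem */
	uint32_t pd_size   = 256U;
	uint32_t slot      = 2U;
	uint32_t mem_offs  = slot * pd_size;        /* byte offset into the page    */

	/* What nvgpu_pde_phys_addr() would hand to the parent PDE. */
	uint64_t pd_addr = page_addr + mem_offs;

	/* pd_write(g, pd, w, data) becomes a 32-bit word index into the page. */
	uint32_t w = 5U; /* e.g. the 6th word of this PD */
	uint32_t word_index = mem_offs / (uint32_t)sizeof(uint32_t) + w;

	printf("PD lives at 0x%llx (page 0x%llx + offset %u)\n",
	       (unsigned long long)pd_addr,
	       (unsigned long long)page_addr, (unsigned)mem_offs);
	printf("pd_write(w=%u) touches word %u of the backing page (byte %u)\n",
	       (unsigned)w, (unsigned)word_index, (unsigned)(word_index * 4U));

	return 0;
}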