From 583704620db88e391f6b14acc57af859a70127de Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Fri, 9 Jun 2017 11:42:50 -0700
Subject: gpu: nvgpu: Implement PD packing

In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256
bytes; thus 16 PDs can fit in a single page. To allocate an entire page
for each of these 256 B PDs is extremely wasteful.

This patch aims to alleviate the wasted DMA memory from having small PDs
in a full page by packing multiple small PDs into a single page.

The packing is implemented as a slab allocator - each page is a slab
and from each page multiple PD instances can be allocated.

Several modifications to the nvgpu_gmmu_pd struct also needed to be made
to support this. The nvgpu_mem is now a pointer and there's an explicit
offset into the nvgpu_mem struct so that each nvgpu_gmmu_pd knows what
portion of the memory it's using.

The nvgpu_pde_phys_addr() function and the pd_write() functions also
require some changes since the PD is no longer always situated at the
start of the nvgpu_mem.

Initialization and cleanup of the page tables for each VM was slightly
modified to work through the new pd_cache implementation.

Some PDs (i.e. the PDB), despite not being a full page, still require a
full page for alignment purposes (HW requirements). Thus a direct
allocation method for PDs is still provided. This is also used when a PD
that could in principle be cached is greater than a page in size.

Lastly a new debug flag was added for the pd_cache code.

JIRA NVGPU-30

Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/common/mm/gmmu.c     |  81 +++----
 drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 +++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/common/mm/vm.c       |  50 ++--
 3 files changed, 499 insertions(+), 58 deletions(-)
 create mode 100644 drivers/gpu/nvgpu/common/mm/pd_cache.c

(limited to 'drivers/gpu/nvgpu/common')

diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index ec1bc095..602dfb3b 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm,
 		       struct nvgpu_gmmu_pd *pd,
 		       const struct gk20a_mmu_level *l,
 		       struct nvgpu_gmmu_attrs *attrs);
-
+static u32 pd_size(const struct gk20a_mmu_level *l,
+		   struct nvgpu_gmmu_attrs *attrs);
 /*
  * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
  * VA will be allocated for you. If addr is non-zero then the buffer will be
@@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
 
 int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
 {
+	u32 pdb_size;
+	int err;
+
 	/*
 	 * Need this just for page size. Everything else can be ignored. Also
 	 * note that we can just use pgsz 0 (i.e small pages) since the number
@@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
 		.pgsz = 0,
 	};
 
-	return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs);
-}
+	/*
+	 * PDB size here must be one page so that its address is page size
+	 * aligned. Although lower PDE tables can be aligned at 256B boundaries,
+	 * the main PDB must be page aligned.
+	 */
+	pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
+
+	err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
+	if (WARN_ON(err))
+		return err;
+	/*
+	 * One mb() is done after all mapping operations. Don't need individual
+	 * barriers for each PD write.
+	 */
+	vm->pdb.mem->skip_wmb = true;
+
+	return 0;
+}
 
 /*
  * Ensure that there's a CPU mapping for the page directory memory. This won't
  * always be the case for 32 bit systems since we may need to save kernel
  * virtual memory.
  */
-static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
+static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
 {
-	return nvgpu_mem_begin(g, &entry->mem);
+	return nvgpu_mem_begin(g, pd->mem);
 }
 
 /*
  * Handle any necessary CPU unmap semantics for a page directories DMA memory.
  * For 64 bit platforms this is a noop.
  */
-static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
-{
-	nvgpu_mem_end(g, &entry->mem);
-}
-
-static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes,
-				  struct nvgpu_gmmu_pd *pd)
-{
-	struct gk20a *g = gk20a_from_vm(vm);
-	unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS;
-	int err;
-
-	/*
-	 * On arm32 vmalloc space is a precious commodity so we do not map pages
-	 * by default.
-	 */
-	if (!IS_ENABLED(CONFIG_ARM64))
-		flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
-
-	err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem);
-	if (err)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
-			   struct nvgpu_gmmu_pd *pd)
+static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
 {
-	struct gk20a *g = gk20a_from_vm(vm);
-
-	nvgpu_dma_free(g, &pd->mem);
+	nvgpu_mem_end(g, pd->mem);
 }
 
 /*
@@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
  */
 u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
 {
+	u64 page_addr;
+
 	if (g->mm.has_physical_mode)
-		return sg_phys(pd->mem.priv.sgt->sgl);
+		page_addr = sg_phys(pd->mem->priv.sgt->sgl);
 	else
-		return nvgpu_mem_get_base_addr(g, &pd->mem, 0);
+		page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0);
+
+	return page_addr + pd->mem_offs;
 }
 
 /*
@@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm,
 {
 	int err;
 
-	if (pd->mem.size)
+	if (pd->mem)
 		return 0;
 
-	err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd);
+	err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs));
 	if (err) {
 		nvgpu_info(vm->mm->g, "error allocating page directory!");
 		return err;
@@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm,
 	 * One mb() is done after all mapping operations. Don't need individual
 	 * barriers for each PD write.
 	 */
-	pd->mem.skip_wmb = true;
+	pd->mem->skip_wmb = true;
 
 	return 0;
 }
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 	}
 
 	if (!batch)
-		g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+		g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
 	else
 		batch->need_tlb_invalidate = true;
 
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
 
 	if (!batch) {
 		gk20a_mm_l2_flush(g, true);
-		g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+		g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
 	} else {
 		if (!batch->gpu_l2_flushed) {
 			gk20a_mm_l2_flush(g, true);
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
new file mode 100644
index 00000000..4f312eff
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gk20a/gk20a.h"
+#include "gk20a/mm_gk20a.h"
+
+#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
+
+/**
+ * DOC: PD cache
+ *
+ * In the name of saving memory with the many sub-page sized PD levels in
+ * Pascal and beyond, a way of packing PD tables together is necessary. This
+ * code does just that. If a PD table only requires 1024 bytes, then it is
+ * possible to have 4 of these PDs in one page. This is even more pronounced
+ * for 256 byte PD tables.
+ *
+ * The pd cache is basically just a slab allocator. Each instance of the nvgpu
+ * driver makes one of these structs:
+ *
+ * struct nvgpu_pd_cache {
+ *	struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+ *	struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+ *
+ *	struct nvgpu_rbtree_node *mem_tree;
+ * };
+ *
+ * There are two sets of lists, the full and the partial. The full lists
+ * contain pages of memory for which all the memory in that page is in use.
+ * The partial lists contain partially full pages of memory which can be used
+ * for more PD allocations. There are a couple of assumptions here:
+ *
+ * 1. PDs greater than or equal to the page size bypass the pd cache.
+ * 2. PDs are always a power of 2 in size and at least %NVGPU_PD_CACHE_MIN
+ *    bytes.
+ *
+ * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
+ * lists. For a 4KB page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
+ * 256, 512, 1024, and 2048 byte PDs.
+ *
+ * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
+ * size is page size or larger and choose the correct allocation scheme -
+ * either from the PD cache or directly. Similarly __nvgpu_pd_free() will free
+ * a PD allocated by __nvgpu_pd_alloc().
+ *
+ * Since the top level PD (the PDB) is a page aligned pointer but less than a
+ * page in size, the direct functions must be used for allocating PDBs.
+ * Otherwise there would be alignment issues for the PDBs when they get packed.
+ */
+
+static u32 nvgpu_pd_cache_nr(u32 bytes)
+{
+	return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1));
+}
+
+static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+{
+	u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
+
+	return mask_offset - 1;
+}
+
+int nvgpu_pd_cache_init(struct gk20a *g)
+{
+	struct nvgpu_pd_cache *cache;
+	int i;
+
+	/*
+	 * This gets called from finalize_poweron() so we need to make sure we
+	 * don't reinit the pd_cache over and over.
+	 */
+	if (g->mm.pd_cache)
+		return 0;
+
+	cache = nvgpu_kzalloc(g, sizeof(*cache));
+	if (!cache) {
+		nvgpu_err(g, "Failed to alloc pd_cache!");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
+		nvgpu_init_list_node(&cache->full[i]);
+		nvgpu_init_list_node(&cache->partial[i]);
+	}
+
+	cache->mem_tree = NULL;
+	g->mm.pd_cache = cache;
+	nvgpu_mutex_init(&cache->lock);
+
+	pd_dbg(g, "PD cache initialized!");
+
+	return 0;
+}
+
+void nvgpu_pd_cache_fini(struct gk20a *g)
+{
+	int i;
+	struct nvgpu_pd_cache *cache = g->mm.pd_cache;
+
+	if (!cache)
+		return;
+
+	for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
+		WARN_ON(!nvgpu_list_empty(&cache->full[i]));
+		WARN_ON(!nvgpu_list_empty(&cache->partial[i]));
+	}
+
+	nvgpu_kfree(g, g->mm.pd_cache);
+}
+
+/*
+ * This is the simple pass-through for page sized or larger PDs.
+ *
+ * Note: this does not need the cache lock since it does not modify any of the
+ * PD cache data structures.
+ */
+int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+				  struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	int err;
+
+	pd_dbg(g, "PD-Alloc [D] %u bytes", bytes);
+
+	pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem));
+	if (!pd->mem) {
+		pd_dbg(g, "OOM allocating nvgpu_mem struct!");
+		return -ENOMEM;
+	}
+
+	err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
+				    bytes, pd->mem);
+	if (err) {
+		pd_dbg(g, "OOM allocating page directory!");
+		nvgpu_kfree(g, pd->mem);
+		return -ENOMEM;
+	}
+
+	pd->cached = false;
+	pd->mem_offs = 0;
+
+	return 0;
+}
+
+/*
+ * Make a new nvgpu_pd_mem_entry and allocate a PD from it. Update the passed
+ * pd to reflect this allocation.
+ */
+static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
+				    struct nvgpu_pd_cache *cache,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 bytes)
+{
+	struct nvgpu_pd_mem_entry *pentry;
+
+	pd_dbg(g, "PD-Alloc [C] New: offs=0");
+
+	pentry = nvgpu_kzalloc(g, sizeof(*pentry));
+	if (!pentry) {
+		pd_dbg(g, "OOM allocating pentry!");
+		return -ENOMEM;
+	}
+
+	if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
+				  PAGE_SIZE, &pentry->mem)) {
+		nvgpu_kfree(g, pentry);
+		pd_dbg(g, "Unable to DMA alloc!");
+		return -ENOMEM;
+	}
+
+	pentry->pd_size = bytes;
+	nvgpu_list_add(&pentry->list_entry,
+		       &cache->partial[nvgpu_pd_cache_nr(bytes)]);
+
+	/*
+	 * This allocates the very first PD table in the set of tables in this
+	 * nvgpu_pd_mem_entry.
+	 */
+	pentry->alloc_map = 1;
+
+	/*
+	 * Now update the nvgpu_gmmu_pd to reflect this allocation.
+	 */
+	pd->mem = &pentry->mem;
+	pd->mem_offs = 0;
+	pd->cached = true;
+
+	pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem;
+	nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree);
+
+	return 0;
+}
+
+static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
+					     struct nvgpu_pd_cache *cache,
+					     struct nvgpu_pd_mem_entry *pentry,
+					     struct nvgpu_gmmu_pd *pd)
+{
+	unsigned long bit_offs;
+	u32 mem_offs;
+	u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+
+	/*
+	 * Find and allocate an open PD.
+	 */
+	bit_offs = ffz(pentry->alloc_map);
+	mem_offs = bit_offs * pentry->pd_size;
+
+	/* Bit map full. Something's wrong. */
+	if (WARN_ON(bit_offs >= ffz(pentry_mask)))
+		return -ENOMEM;
+
+	pentry->alloc_map |= 1 << bit_offs;
+
+	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
+
+	/*
+	 * First update the pd.
+	 */
+	pd->mem = &pentry->mem;
+	pd->mem_offs = mem_offs;
+	pd->cached = true;
+
+	/*
+	 * Now make sure the pentry is in the correct list (full vs partial).
+	 */
+	if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+		pd_dbg(g, "Adding pentry to full list!");
+		nvgpu_list_del(&pentry->list_entry);
+		nvgpu_list_add(&pentry->list_entry,
+			       &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]);
+	}
+
+	return 0;
+}
+
+/*
+ * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no
+ * partial nvgpu_pd_mem_entry available.
+ */
+static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial(
+	struct nvgpu_pd_cache *cache, u32 bytes)
+{
+	struct nvgpu_list_node *list =
+		&cache->partial[nvgpu_pd_cache_nr(bytes)];
+
+	if (nvgpu_list_empty(list))
+		return NULL;
+
+	return nvgpu_list_first_entry(list,
+				      nvgpu_pd_mem_entry,
+				      list_entry);
+}
+
+/*
+ * Allocate memory from an nvgpu_mem for the page directory.
+ */
+static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
+				struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	struct nvgpu_pd_mem_entry *pentry;
+	int err;
+
+	pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
+
+	if (bytes & (bytes - 1) ||
+	    (bytes >= PAGE_SIZE ||
+	     bytes < NVGPU_PD_CACHE_MIN)) {
+		pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
+		return -EINVAL;
+	}
+
+	pentry = nvgpu_pd_cache_get_partial(cache, bytes);
+	if (!pentry)
+		err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes);
+	else
+		err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd);
+
+	if (err)
+		pd_dbg(g, "PD-Alloc [C] Failed!");
+
+	return err;
+}
+
+/*
+ * Allocate the DMA memory for a page directory. This handles the necessary PD
+ * cache logistics. Since on Parker and later GPUs some of the page directories
+ * are smaller than a page, packing these PDs together saves a lot of memory.
+ */
+int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	int err;
+
+	/*
+	 * Simple case: the PD is at least a page in size so just do a regular
+	 * DMA alloc.
+	 */
+	if (bytes >= PAGE_SIZE) {
+		err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+		if (err)
+			return err;
+
+		return 0;
+	}
+
+	if (WARN_ON(!g->mm.pd_cache))
+		return -ENOMEM;
+
+	nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
+	err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes);
+	nvgpu_mutex_release(&g->mm.pd_cache->lock);
+
+	return err;
+}
+
+void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
+{
+	pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);
+
+	if (!pd->mem)
+		return;
+
+	nvgpu_dma_free(g, pd->mem);
+	nvgpu_kfree(g, pd->mem);
+	pd->mem = NULL;
+}
+
+static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g,
+					  struct nvgpu_pd_cache *cache,
+					  struct nvgpu_pd_mem_entry *pentry)
+{
+	nvgpu_dma_free(g, &pentry->mem);
+	nvgpu_list_del(&pentry->list_entry);
+	nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree);
+	nvgpu_kfree(g, pentry);
+}
+
+static void nvgpu_pd_cache_do_free(struct gk20a *g,
+				   struct nvgpu_pd_cache *cache,
+				   struct nvgpu_pd_mem_entry *pentry,
+				   struct nvgpu_gmmu_pd *pd)
+{
+	u32 index = pd->mem_offs / pentry->pd_size;
+	u32 bit = 1 << index;
+
+	/* Mark entry as free. */
+	pentry->alloc_map &= ~bit;
+
+	if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+		/*
+		 * Partially full still. If it was already on the partial list
+		 * this just re-adds it.
+		 */
+		nvgpu_list_del(&pentry->list_entry);
+		nvgpu_list_add(&pentry->list_entry,
+			       &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
+	} else {
+		/* Empty now so free it. */
+		nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
+	}
+}
+
+static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
+	struct gk20a *g,
+	struct nvgpu_pd_cache *cache,
+	struct nvgpu_gmmu_pd *pd)
+{
+	struct nvgpu_rbtree_node *node;
+
+	nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node,
+			    cache->mem_tree);
+	if (!node)
+		return NULL;
+
+	return nvgpu_pd_mem_entry_from_tree_entry(node);
+}
+
+static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache,
+				struct nvgpu_gmmu_pd *pd)
+{
+	struct nvgpu_pd_mem_entry *pentry;
+
+	pd_dbg(g, "PD-Free [C] 0x%p", pd->mem);
+
+	pentry = nvgpu_pd_cache_look_up(g, cache, pd);
+	if (!pentry) {
+		WARN(1, "Attempting to free non-existent pd");
+		return;
+	}
+
+	nvgpu_pd_cache_do_free(g, cache, pentry, pd);
+}
+
+void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+
+	/*
+	 * Simple case: just DMA free.
+	 */
+	if (!pd->cached)
+		return __nvgpu_pd_cache_free_direct(g, pd);
+
+	nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
+	nvgpu_pd_cache_free(g, g->mm.pd_cache, pd);
+	nvgpu_mutex_release(&g->mm.pd_cache->lock);
+}
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 3aeba500..3ed3c7fe 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm)
 	return vm->as_share ? vm->as_share->id : -1;
 }
 
-static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
-				  struct nvgpu_gmmu_pd *parent,
-				  int level)
+static void __nvgpu_vm_free_entries(struct vm_gk20a *vm,
+				    struct nvgpu_gmmu_pd *pd,
+				    int level)
 {
 	int i;
 
-	if (parent->entries)
-		for (i = 0; i < parent->num_entries; i++)
-			nvgpu_vm_free_entries(vm, &parent->entries[i],
+	if (pd->mem) {
+		__nvgpu_pd_free(vm, pd);
+		pd->mem = NULL;
+	}
+
+	if (pd->entries) {
+		for (i = 0; i < pd->num_entries; i++)
+			__nvgpu_vm_free_entries(vm, &pd->entries[i],
 					      level + 1);
+		nvgpu_vfree(vm->mm->g, pd->entries);
+		pd->entries = NULL;
+	}
+}
+
+static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
+				  struct nvgpu_gmmu_pd *pdb)
+{
+	struct gk20a *g = vm->mm->g;
+	int i;
+
+	__nvgpu_pd_cache_free_direct(g, pdb);
+
+	if (!pdb->entries)
+		return;
+
+	for (i = 0; i < pdb->num_entries; i++)
+		__nvgpu_vm_free_entries(vm, &pdb->entries[i], 1);
 
-	if (parent->mem.size)
-		nvgpu_free_gmmu_pages(vm, parent);
-	nvgpu_vfree(vm->mm->g, parent->entries);
-	parent->entries = NULL;
+	nvgpu_vfree(g, pdb->entries);
+	pdb->entries = NULL;
 }
 
 u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
@@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked(
 
 	if (mapping_batch->need_tlb_invalidate) {
 		struct gk20a *g = gk20a_from_vm(vm);
-		g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+		g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
 	}
 }
 
@@ -407,9 +428,8 @@ clean_up_allocators:
 	if (nvgpu_alloc_initialized(&vm->user_lp))
 		nvgpu_alloc_destroy(&vm->user_lp);
 clean_up_page_tables:
-	/* Cleans up nvgpu_vm_init_page_tables() */
-	nvgpu_vfree(g, vm->pdb.entries);
-	nvgpu_free_gmmu_pages(vm, &vm->pdb);
+	/* Cleans up nvgpu_gmmu_init_page_table() */
+	__nvgpu_pd_cache_free_direct(g, &vm->pdb);
 clean_up_vgpu_vm:
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
 	if (g->is_virtual)
@@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm)
 	if (nvgpu_alloc_initialized(&vm->user_lp))
 		nvgpu_alloc_destroy(&vm->user_lp);
 
-	nvgpu_vm_free_entries(vm, &vm->pdb, 0);
+	nvgpu_vm_free_entries(vm, &vm->pdb);
 
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
 	if (g->is_virtual)
-- 
cgit v1.2.2
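
The sketch below is not part of the patch; it is a self-contained illustration of the packing arithmetic described in the commit message and the pd_cache DOC comment. The names (pd_slab, slab_alloc, EXAMPLE_PAGE_SIZE, and so on) are invented for the example; only the 4 KB page / 256 B PD numbers and the bitmap logic mirror what nvgpu_pd_cache_get_mask(), nvgpu_pd_cache_alloc_from_partial() and nvgpu_pd_cache_do_free() do with pd_size, alloc_map and mem_offs.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u

/*
 * One "slab" is one DMA page carved into equal, power-of-two PD slots.
 * alloc_map has one bit per slot, analogous to alloc_map in the patch's
 * nvgpu_pd_mem_entry.
 */
struct pd_slab {
	unsigned int pd_size;   /* bytes per PD; power of 2, less than a page */
	unsigned int alloc_map; /* bit i set => slot i is in use */
};

/* Mask with one bit per slot: 16 slots of 256 B in a 4 KB page => 0xffff. */
static unsigned int slab_full_mask(const struct pd_slab *s)
{
	return (1u << (EXAMPLE_PAGE_SIZE / s->pd_size)) - 1u;
}

/*
 * Find the first clear bit (the driver uses ffz() for this), mark it used,
 * and return the byte offset of that PD within the page (the mem_offs idea).
 */
static int slab_alloc(struct pd_slab *s, unsigned int *offset)
{
	unsigned int mask = slab_full_mask(s);
	unsigned int bit;

	for (bit = 0; (mask >> bit) & 1u; bit++) {
		if (!(s->alloc_map & (1u << bit))) {
			s->alloc_map |= 1u << bit;
			*offset = bit * s->pd_size;
			return 0;
		}
	}
	return -1; /* slab is full; the driver would keep it on a "full" list */
}

int main(void)
{
	struct pd_slab s = { .pd_size = 256, .alloc_map = 0 };
	unsigned int offs;

	while (slab_alloc(&s, &offs) == 0)
		printf("allocated PD at offset %u\n", offs);
	printf("slab full: alloc_map=0x%x (mask=0x%x)\n",
	       s.alloc_map, slab_full_mask(&s));
	return 0;
}

Running this prints offsets 0, 256, ..., 3840 and then reports the slab as full with alloc_map equal to the mask, which is the same full-versus-partial condition the patch uses when deciding which nvgpu_pd_cache list an nvgpu_pd_mem_entry belongs on.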