diff options
-rw-r--r-- | drivers/gpu/nvgpu/Makefile.nvgpu | 1 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/gmmu.c | 81 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/vm.c | 50 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.c | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gp10b/mm_gp10b.c | 10 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 91 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/log.h | 1 |
10 files changed, 609 insertions, 73 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index 3a256771..4aaf7bc5 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu | |||
@@ -50,6 +50,7 @@ nvgpu-y := \ | |||
50 | common/mm/page_allocator.o \ | 50 | common/mm/page_allocator.o \ |
51 | common/mm/lockless_allocator.o \ | 51 | common/mm/lockless_allocator.o \ |
52 | common/mm/gmmu.o \ | 52 | common/mm/gmmu.o \ |
53 | common/mm/pd_cache.o \ | ||
53 | common/mm/vm.o \ | 54 | common/mm/vm.o \ |
54 | common/mm/vm_area.o \ | 55 | common/mm/vm_area.o \ |
55 | common/bus.o \ | 56 | common/bus.o \ |
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index ec1bc095..602dfb3b 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c | |||
@@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
45 | struct nvgpu_gmmu_pd *pd, | 45 | struct nvgpu_gmmu_pd *pd, |
46 | const struct gk20a_mmu_level *l, | 46 | const struct gk20a_mmu_level *l, |
47 | struct nvgpu_gmmu_attrs *attrs); | 47 | struct nvgpu_gmmu_attrs *attrs); |
48 | 48 | static u32 pd_size(const struct gk20a_mmu_level *l, | |
49 | struct nvgpu_gmmu_attrs *attrs); | ||
49 | /* | 50 | /* |
50 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU | 51 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU |
51 | * VA will be allocated for you. If addr is non-zero then the buffer will be | 52 | * VA will be allocated for you. If addr is non-zero then the buffer will be |
@@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) | |||
138 | 139 | ||
139 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) | 140 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) |
140 | { | 141 | { |
142 | u32 pdb_size; | ||
143 | int err; | ||
144 | |||
141 | /* | 145 | /* |
142 | * Need this just for page size. Everything else can be ignored. Also | 146 | * Need this just for page size. Everything else can be ignored. Also |
143 | * note that we can just use pgsz 0 (i.e small pages) since the number | 147 | * note that we can just use pgsz 0 (i.e small pages) since the number |
@@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) | |||
148 | .pgsz = 0, | 152 | .pgsz = 0, |
149 | }; | 153 | }; |
150 | 154 | ||
151 | return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); | 155 | /* |
152 | } | 156 | * PDB size here must be one page so that its address is page size |
157 | * aligned. Although lower PDE tables can be aligned at 256B boundaries | ||
158 | * the main PDB must be page aligned. | ||
159 | */ | ||
160 | pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE); | ||
161 | |||
162 | err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size); | ||
163 | if (WARN_ON(err)) | ||
164 | return err; | ||
153 | 165 | ||
166 | /* | ||
167 | * One mb() is done after all mapping operations. Don't need individual | ||
168 | * barriers for each PD write. | ||
169 | */ | ||
170 | vm->pdb.mem->skip_wmb = true; | ||
171 | |||
172 | return 0; | ||
173 | } | ||
154 | 174 | ||
155 | /* | 175 | /* |
156 | * Ensure that there's a CPU mapping for the page directory memory. This won't | 176 | * Ensure that there's a CPU mapping for the page directory memory. This won't |
157 | * always be the case for 32 bit systems since we may need to save kernel | 177 | * always be the case for 32 bit systems since we may need to save kernel |
158 | * virtual memory. | 178 | * virtual memory. |
159 | */ | 179 | */ |
160 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | 180 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
161 | { | 181 | { |
162 | return nvgpu_mem_begin(g, &entry->mem); | 182 | return nvgpu_mem_begin(g, pd->mem); |
163 | } | 183 | } |
164 | 184 | ||
165 | /* | 185 | /* |
166 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. | 186 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. |
167 | * For 64 bit platforms this is a noop. | 187 | * For 64 bit platforms this is a noop. |
168 | */ | 188 | */ |
169 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | 189 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
170 | { | ||
171 | nvgpu_mem_end(g, &entry->mem); | ||
172 | } | ||
173 | |||
174 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, | ||
175 | struct nvgpu_gmmu_pd *pd) | ||
176 | { | ||
177 | struct gk20a *g = gk20a_from_vm(vm); | ||
178 | unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; | ||
179 | int err; | ||
180 | |||
181 | /* | ||
182 | * On arm32 vmalloc space is a precious commodity so we do not map pages | ||
183 | * by default. | ||
184 | */ | ||
185 | if (!IS_ENABLED(CONFIG_ARM64)) | ||
186 | flags |= NVGPU_DMA_NO_KERNEL_MAPPING; | ||
187 | |||
188 | err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); | ||
189 | if (err) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
196 | struct nvgpu_gmmu_pd *pd) | ||
197 | { | 190 | { |
198 | struct gk20a *g = gk20a_from_vm(vm); | 191 | nvgpu_mem_end(g, pd->mem); |
199 | |||
200 | nvgpu_dma_free(g, &pd->mem); | ||
201 | } | 192 | } |
202 | 193 | ||
203 | /* | 194 | /* |
@@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | |||
205 | */ | 196 | */ |
206 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | 197 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
207 | { | 198 | { |
199 | u64 page_addr; | ||
200 | |||
208 | if (g->mm.has_physical_mode) | 201 | if (g->mm.has_physical_mode) |
209 | return sg_phys(pd->mem.priv.sgt->sgl); | 202 | page_addr = sg_phys(pd->mem->priv.sgt->sgl); |
210 | else | 203 | else |
211 | return nvgpu_mem_get_base_addr(g, &pd->mem, 0); | 204 | page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0); |
205 | |||
206 | return page_addr + pd->mem_offs; | ||
212 | } | 207 | } |
213 | 208 | ||
214 | /* | 209 | /* |
@@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
254 | { | 249 | { |
255 | int err; | 250 | int err; |
256 | 251 | ||
257 | if (pd->mem.size) | 252 | if (pd->mem) |
258 | return 0; | 253 | return 0; |
259 | 254 | ||
260 | err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); | 255 | err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs)); |
261 | if (err) { | 256 | if (err) { |
262 | nvgpu_info(vm->mm->g, "error allocating page directory!"); | 257 | nvgpu_info(vm->mm->g, "error allocating page directory!"); |
263 | return err; | 258 | return err; |
@@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
267 | * One mb() is done after all mapping operations. Don't need individual | 262 | * One mb() is done after all mapping operations. Don't need individual |
268 | * barriers for each PD write. | 263 | * barriers for each PD write. |
269 | */ | 264 | */ |
270 | pd->mem.skip_wmb = true; | 265 | pd->mem->skip_wmb = true; |
271 | 266 | ||
272 | return 0; | 267 | return 0; |
273 | } | 268 | } |
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
778 | } | 773 | } |
779 | 774 | ||
780 | if (!batch) | 775 | if (!batch) |
781 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 776 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
782 | else | 777 | else |
783 | batch->need_tlb_invalidate = true; | 778 | batch->need_tlb_invalidate = true; |
784 | 779 | ||
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | |||
830 | 825 | ||
831 | if (!batch) { | 826 | if (!batch) { |
832 | gk20a_mm_l2_flush(g, true); | 827 | gk20a_mm_l2_flush(g, true); |
833 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 828 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
834 | } else { | 829 | } else { |
835 | if (!batch->gpu_l2_flushed) { | 830 | if (!batch->gpu_l2_flushed) { |
836 | gk20a_mm_l2_flush(g, true); | 831 | gk20a_mm_l2_flush(g, true); |
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c new file mode 100644 index 00000000..4f312eff --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/log.h> | ||
18 | #include <nvgpu/dma.h> | ||
19 | #include <nvgpu/gmmu.h> | ||
20 | #include <nvgpu/nvgpu_mem.h> | ||
21 | #include <nvgpu/list.h> | ||
22 | #include <nvgpu/log2.h> | ||
23 | |||
24 | #include "gk20a/gk20a.h" | ||
25 | #include "gk20a/mm_gk20a.h" | ||
26 | |||
27 | #define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args) | ||
28 | |||
29 | /** | ||
30 | * DOC: PD cache | ||
31 | * | ||
32 | * In the name of saving memory with the many sub-page sized PD levels in Pascal | ||
33 | * and beyond a way of packing PD tables together is necessary. This code here | ||
34 | * does just that. If a PD table only requires 1024 bytes, then it is possible | ||
35 | * to have 4 of these PDs in one page. This is even more pronounced for 256 byte | ||
36 | * PD tables. | ||
37 | * | ||
38 | * The pd cache is basically just a slab allocator. Each instance of the nvgpu | ||
39 | * driver makes one of these structs: | ||
40 | * | ||
41 | * struct nvgpu_pd_cache { | ||
42 | * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
43 | * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
44 | * | ||
45 | * struct nvgpu_rbtree_node *mem_tree; | ||
46 | * }; | ||
47 | * | ||
48 | * There are two sets of lists, the full and the partial. The full lists contain | ||
49 | * pages of memory for which all the memory in that page is in use. The partial | ||
50 | * lists contain partially full pages of memory which can be used for more PD | ||
51 | * allocations. There are a couple of assumptions here: | ||
52 | * | ||
53 | * 1. PDs greater than or equal to the page size bypass the pd cache. | ||
54 | * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes. | ||
55 | * | ||
56 | * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial | ||
57 | * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for | ||
58 | * 256, 512, 1024, and 2048 byte PDs. | ||
59 | * | ||
60 | * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD | ||
61 | * size is page size or larger and choose the correct allocation scheme - either | ||
62 | * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD | ||
63 | * allocated by __nvgpu_pd_alloc(). | ||
64 | * | ||
65 | * Since the top level PD (the PDB) is a page aligned pointer but less than a | ||
66 | * page size the direct functions must be used for allocating PDBs. Otherwise | ||
67 | * there would be alignment issues for the PDBs when they get packed. | ||
68 | */ | ||
69 | |||
70 | static u32 nvgpu_pd_cache_nr(u32 bytes) | ||
71 | { | ||
72 | return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); | ||
73 | } | ||
74 | |||
75 | static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) | ||
76 | { | ||
77 | u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); | ||
78 | |||
79 | return mask_offset - 1; | ||
80 | } | ||
81 | |||
82 | int nvgpu_pd_cache_init(struct gk20a *g) | ||
83 | { | ||
84 | struct nvgpu_pd_cache *cache; | ||
85 | int i; | ||
86 | |||
87 | /* | ||
88 | * This gets called from finalize_poweron() so we need to make sure we | ||
89 | * don't reinit the pd_cache over and over. | ||
90 | */ | ||
91 | if (g->mm.pd_cache) | ||
92 | return 0; | ||
93 | |||
94 | cache = nvgpu_kzalloc(g, sizeof(*cache)); | ||
95 | if (!cache) { | ||
96 | nvgpu_err(g, "Failed to alloc pd_cache!"); | ||
97 | return -ENOMEM; | ||
98 | } | ||
99 | |||
100 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
101 | nvgpu_init_list_node(&cache->full[i]); | ||
102 | nvgpu_init_list_node(&cache->partial[i]); | ||
103 | } | ||
104 | |||
105 | cache->mem_tree = NULL; | ||
106 | g->mm.pd_cache = cache; | ||
107 | nvgpu_mutex_init(&cache->lock); | ||
108 | |||
109 | pd_dbg(g, "PD cache initialized!"); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | void nvgpu_pd_cache_fini(struct gk20a *g) | ||
115 | { | ||
116 | int i; | ||
117 | struct nvgpu_pd_cache *cache = g->mm.pd_cache; | ||
118 | |||
119 | if (!cache) | ||
120 | return; | ||
121 | |||
122 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
123 | WARN_ON(!nvgpu_list_empty(&cache->full[i])); | ||
124 | WARN_ON(!nvgpu_list_empty(&cache->partial[i])); | ||
125 | } | ||
126 | |||
127 | nvgpu_kfree(g, g->mm.pd_cache); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * This is the simple pass-through for greater than page or page sized PDs. | ||
132 | * | ||
133 | * Note: this does not need the cache lock since it does not modify any of the | ||
134 | * PD cache data structures. | ||
135 | */ | ||
136 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
137 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
138 | { | ||
139 | int err; | ||
140 | |||
141 | pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); | ||
142 | |||
143 | pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); | ||
144 | if (!pd->mem) { | ||
145 | pd_dbg(g, "OOM allocating nvgpu_mem struct!"); | ||
146 | return -ENOMEM; | ||
147 | } | ||
148 | |||
149 | err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
150 | bytes, pd->mem); | ||
151 | if (err) { | ||
152 | pd_dbg(g, "OOM allocating page directory!"); | ||
153 | nvgpu_kfree(g, pd->mem); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | pd->cached = false; | ||
158 | pd->mem_offs = 0; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed | ||
165 | * pd to reflect this allocation. | ||
166 | */ | ||
167 | static int nvgpu_pd_cache_alloc_new(struct gk20a *g, | ||
168 | struct nvgpu_pd_cache *cache, | ||
169 | struct nvgpu_gmmu_pd *pd, | ||
170 | u32 bytes) | ||
171 | { | ||
172 | struct nvgpu_pd_mem_entry *pentry; | ||
173 | |||
174 | pd_dbg(g, "PD-Alloc [C] New: offs=0"); | ||
175 | |||
176 | pentry = nvgpu_kzalloc(g, sizeof(*pentry)); | ||
177 | if (!pentry) { | ||
178 | pd_dbg(g, "OOM allocating pentry!"); | ||
179 | return -ENOMEM; | ||
180 | } | ||
181 | |||
182 | if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
183 | PAGE_SIZE, &pentry->mem)) { | ||
184 | nvgpu_kfree(g, pentry); | ||
185 | pd_dbg(g, "Unable to DMA alloc!"); | ||
186 | return -ENOMEM; | ||
187 | } | ||
188 | |||
189 | pentry->pd_size = bytes; | ||
190 | nvgpu_list_add(&pentry->list_entry, | ||
191 | &cache->partial[nvgpu_pd_cache_nr(bytes)]); | ||
192 | |||
193 | /* | ||
194 | * This allocates the very first PD table in the set of tables in this | ||
195 | * nvgpu_pd_mem_entry. | ||
196 | */ | ||
197 | pentry->alloc_map = 1; | ||
198 | |||
199 | /* | ||
200 | * Now update the nvgpu_gmmu_pd to reflect this allocation. | ||
201 | */ | ||
202 | pd->mem = &pentry->mem; | ||
203 | pd->mem_offs = 0; | ||
204 | pd->cached = true; | ||
205 | |||
206 | pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; | ||
207 | nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, | ||
213 | struct nvgpu_pd_cache *cache, | ||
214 | struct nvgpu_pd_mem_entry *pentry, | ||
215 | struct nvgpu_gmmu_pd *pd) | ||
216 | { | ||
217 | unsigned long bit_offs; | ||
218 | u32 mem_offs; | ||
219 | u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); | ||
220 | |||
221 | /* | ||
222 | * Find and allocate an open PD. | ||
223 | */ | ||
224 | bit_offs = ffz(pentry->alloc_map); | ||
225 | mem_offs = bit_offs * pentry->pd_size; | ||
226 | |||
227 | /* Bit map full. Something's wrong. */ | ||
228 | if (WARN_ON(bit_offs >= ffz(pentry_mask))) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | pentry->alloc_map |= 1 << bit_offs; | ||
232 | |||
233 | pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); | ||
234 | |||
235 | /* | ||
236 | * First update the pd. | ||
237 | */ | ||
238 | pd->mem = &pentry->mem; | ||
239 | pd->mem_offs = mem_offs; | ||
240 | pd->cached = true; | ||
241 | |||
242 | /* | ||
243 | * Now make sure the pentry is in the correct list (full vs partial). | ||
244 | */ | ||
245 | if ((pentry->alloc_map & pentry_mask) == pentry_mask) { | ||
246 | pd_dbg(g, "Adding pentry to full list!"); | ||
247 | nvgpu_list_del(&pentry->list_entry); | ||
248 | nvgpu_list_add(&pentry->list_entry, | ||
249 | &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
250 | } | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial | ||
257 | * nvgpu_pd_mem_entry's. | ||
258 | */ | ||
259 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( | ||
260 | struct nvgpu_pd_cache *cache, u32 bytes) | ||
261 | { | ||
262 | struct nvgpu_list_node *list = | ||
263 | &cache->partial[nvgpu_pd_cache_nr(bytes)]; | ||
264 | |||
265 | if (nvgpu_list_empty(list)) | ||
266 | return NULL; | ||
267 | |||
268 | return nvgpu_list_first_entry(list, | ||
269 | nvgpu_pd_mem_entry, | ||
270 | list_entry); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Allocate memory from an nvgpu_mem for the page directory. | ||
275 | */ | ||
276 | static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
277 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
278 | { | ||
279 | struct nvgpu_pd_mem_entry *pentry; | ||
280 | int err; | ||
281 | |||
282 | pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); | ||
283 | |||
284 | if (bytes & (bytes - 1) || | ||
285 | (bytes >= PAGE_SIZE || | ||
286 | bytes < NVGPU_PD_CACHE_MIN)) { | ||
287 | pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); | ||
288 | return -EINVAL; | ||
289 | } | ||
290 | |||
291 | pentry = nvgpu_pd_cache_get_partial(cache, bytes); | ||
292 | if (!pentry) | ||
293 | err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); | ||
294 | else | ||
295 | err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); | ||
296 | |||
297 | if (err) | ||
298 | pd_dbg(g, "PD-Alloc [C] Failed!"); | ||
299 | |||
300 | return err; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Allocate the DMA memory for a page directory. This handles the necessary PD | ||
305 | * cache logistics. Since on Parker and later GPUs some of the page directories | ||
306 | * are smaller than a page packing these PDs together saves a lot of memory. | ||
307 | */ | ||
308 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
309 | { | ||
310 | struct gk20a *g = gk20a_from_vm(vm); | ||
311 | int err; | ||
312 | |||
313 | /* | ||
314 | * Simple case: PD is bigger than a page so just do a regular DMA | ||
315 | * alloc. | ||
316 | */ | ||
317 | if (bytes >= PAGE_SIZE) { | ||
318 | err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); | ||
319 | if (err) | ||
320 | return err; | ||
321 | |||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | if (WARN_ON(!g->mm.pd_cache)) | ||
326 | return -ENOMEM; | ||
327 | |||
328 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
329 | err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); | ||
330 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
331 | |||
332 | return err; | ||
333 | } | ||
334 | |||
335 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
336 | { | ||
337 | pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); | ||
338 | |||
339 | if (!pd->mem) | ||
340 | return; | ||
341 | |||
342 | nvgpu_dma_free(g, pd->mem); | ||
343 | nvgpu_kfree(g, pd->mem); | ||
344 | pd->mem = NULL; | ||
345 | } | ||
346 | |||
347 | static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, | ||
348 | struct nvgpu_pd_cache *cache, | ||
349 | struct nvgpu_pd_mem_entry *pentry) | ||
350 | { | ||
351 | nvgpu_dma_free(g, &pentry->mem); | ||
352 | nvgpu_list_del(&pentry->list_entry); | ||
353 | nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); | ||
354 | nvgpu_kfree(g, pentry); | ||
355 | } | ||
356 | |||
357 | static void nvgpu_pd_cache_do_free(struct gk20a *g, | ||
358 | struct nvgpu_pd_cache *cache, | ||
359 | struct nvgpu_pd_mem_entry *pentry, | ||
360 | struct nvgpu_gmmu_pd *pd) | ||
361 | { | ||
362 | u32 index = pd->mem_offs / pentry->pd_size; | ||
363 | u32 bit = 1 << index; | ||
364 | |||
365 | /* Mark entry as free. */ | ||
366 | pentry->alloc_map &= ~bit; | ||
367 | |||
368 | if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { | ||
369 | /* | ||
370 | * Partially full still. If it was already on the partial list | ||
371 | * this just re-adds it. | ||
372 | */ | ||
373 | nvgpu_list_del(&pentry->list_entry); | ||
374 | nvgpu_list_add(&pentry->list_entry, | ||
375 | &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
376 | } else { | ||
377 | /* Empty now so free it. */ | ||
378 | nvgpu_pd_cache_free_mem_entry(g, cache, pentry); | ||
379 | } | ||
380 | } | ||
381 | |||
382 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( | ||
383 | struct gk20a *g, | ||
384 | struct nvgpu_pd_cache *cache, | ||
385 | struct nvgpu_gmmu_pd *pd) | ||
386 | { | ||
387 | struct nvgpu_rbtree_node *node; | ||
388 | |||
389 | nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, | ||
390 | cache->mem_tree); | ||
391 | if (!node) | ||
392 | return NULL; | ||
393 | |||
394 | return nvgpu_pd_mem_entry_from_tree_entry(node); | ||
395 | } | ||
396 | |||
397 | static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
398 | struct nvgpu_gmmu_pd *pd) | ||
399 | { | ||
400 | struct nvgpu_pd_mem_entry *pentry; | ||
401 | |||
402 | pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); | ||
403 | |||
404 | pentry = nvgpu_pd_cache_look_up(g, cache, pd); | ||
405 | if (!pentry) { | ||
406 | WARN(1, "Attempting to free non-existent pd"); | ||
407 | return; | ||
408 | } | ||
409 | |||
410 | nvgpu_pd_cache_do_free(g, cache, pentry, pd); | ||
411 | } | ||
412 | |||
413 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) | ||
414 | { | ||
415 | struct gk20a *g = gk20a_from_vm(vm); | ||
416 | |||
417 | /* | ||
418 | * Simple case: just DMA free. | ||
419 | */ | ||
420 | if (!pd->cached) | ||
421 | return __nvgpu_pd_cache_free_direct(g, pd); | ||
422 | |||
423 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
424 | nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); | ||
425 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
426 | } | ||
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index 3aeba500..3ed3c7fe 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c | |||
@@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm) | |||
35 | return vm->as_share ? vm->as_share->id : -1; | 35 | return vm->as_share ? vm->as_share->id : -1; |
36 | } | 36 | } |
37 | 37 | ||
38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | 38 | static void __nvgpu_vm_free_entries(struct vm_gk20a *vm, |
39 | struct nvgpu_gmmu_pd *parent, | 39 | struct nvgpu_gmmu_pd *pd, |
40 | int level) | 40 | int level) |
41 | { | 41 | { |
42 | int i; | 42 | int i; |
43 | 43 | ||
44 | if (parent->entries) | 44 | if (pd->mem) { |
45 | for (i = 0; i < parent->num_entries; i++) | 45 | __nvgpu_pd_free(vm, pd); |
46 | nvgpu_vm_free_entries(vm, &parent->entries[i], | 46 | pd->mem = NULL; |
47 | } | ||
48 | |||
49 | if (pd->entries) { | ||
50 | for (i = 0; i < pd->num_entries; i++) | ||
51 | __nvgpu_vm_free_entries(vm, &pd->entries[i], | ||
47 | level + 1); | 52 | level + 1); |
53 | nvgpu_vfree(vm->mm->g, pd->entries); | ||
54 | pd->entries = NULL; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | ||
59 | struct nvgpu_gmmu_pd *pdb) | ||
60 | { | ||
61 | struct gk20a *g = vm->mm->g; | ||
62 | int i; | ||
63 | |||
64 | __nvgpu_pd_cache_free_direct(g, pdb); | ||
65 | |||
66 | if (!pdb->entries) | ||
67 | return; | ||
68 | |||
69 | for (i = 0; i < pdb->num_entries; i++) | ||
70 | __nvgpu_vm_free_entries(vm, &pdb->entries[i], 1); | ||
48 | 71 | ||
49 | if (parent->mem.size) | 72 | nvgpu_vfree(g, pdb->entries); |
50 | nvgpu_free_gmmu_pages(vm, parent); | 73 | pdb->entries = NULL; |
51 | nvgpu_vfree(vm->mm->g, parent->entries); | ||
52 | parent->entries = NULL; | ||
53 | } | 74 | } |
54 | 75 | ||
55 | u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | 76 | u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, |
@@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked( | |||
110 | 131 | ||
111 | if (mapping_batch->need_tlb_invalidate) { | 132 | if (mapping_batch->need_tlb_invalidate) { |
112 | struct gk20a *g = gk20a_from_vm(vm); | 133 | struct gk20a *g = gk20a_from_vm(vm); |
113 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 134 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
114 | } | 135 | } |
115 | } | 136 | } |
116 | 137 | ||
@@ -407,9 +428,8 @@ clean_up_allocators: | |||
407 | if (nvgpu_alloc_initialized(&vm->user_lp)) | 428 | if (nvgpu_alloc_initialized(&vm->user_lp)) |
408 | nvgpu_alloc_destroy(&vm->user_lp); | 429 | nvgpu_alloc_destroy(&vm->user_lp); |
409 | clean_up_page_tables: | 430 | clean_up_page_tables: |
410 | /* Cleans up nvgpu_vm_init_page_tables() */ | 431 | /* Cleans up nvgpu_gmmu_init_page_table() */ |
411 | nvgpu_vfree(g, vm->pdb.entries); | 432 | __nvgpu_pd_cache_free_direct(g, &vm->pdb); |
412 | nvgpu_free_gmmu_pages(vm, &vm->pdb); | ||
413 | clean_up_vgpu_vm: | 433 | clean_up_vgpu_vm: |
414 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION | 434 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION |
415 | if (g->is_virtual) | 435 | if (g->is_virtual) |
@@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm) | |||
525 | if (nvgpu_alloc_initialized(&vm->user_lp)) | 545 | if (nvgpu_alloc_initialized(&vm->user_lp)) |
526 | nvgpu_alloc_destroy(&vm->user_lp); | 546 | nvgpu_alloc_destroy(&vm->user_lp); |
527 | 547 | ||
528 | nvgpu_vm_free_entries(vm, &vm->pdb, 0); | 548 | nvgpu_vm_free_entries(vm, &vm->pdb); |
529 | 549 | ||
530 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION | 550 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION |
531 | if (g->is_virtual) | 551 | if (g->is_virtual) |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 380c28ac..a0753770 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <nvgpu/soc.h> | 25 | #include <nvgpu/soc.h> |
26 | #include <nvgpu/enabled.h> | 26 | #include <nvgpu/enabled.h> |
27 | #include <nvgpu/pmu.h> | 27 | #include <nvgpu/pmu.h> |
28 | #include <nvgpu/gmmu.h> | ||
28 | 29 | ||
29 | #include <trace/events/gk20a.h> | 30 | #include <trace/events/gk20a.h> |
30 | 31 | ||
@@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g) | |||
174 | g->gpu_reset_done = true; | 175 | g->gpu_reset_done = true; |
175 | } | 176 | } |
176 | 177 | ||
178 | /* | ||
179 | * Do this early so any early VMs that get made are capable of mapping | ||
180 | * buffers. | ||
181 | */ | ||
182 | err = nvgpu_pd_cache_init(g); | ||
183 | if (err) | ||
184 | return err; | ||
185 | |||
177 | /* init interface layer support for PMU falcon */ | 186 | /* init interface layer support for PMU falcon */ |
178 | nvgpu_flcn_sw_init(g, FALCON_ID_PMU); | 187 | nvgpu_flcn_sw_init(g, FALCON_ID_PMU); |
179 | nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); | 188 | nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 558a1b06..0a84cabb 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm) | |||
478 | 478 | ||
479 | gk20a_semaphore_sea_destroy(g); | 479 | gk20a_semaphore_sea_destroy(g); |
480 | gk20a_vidmem_destroy(g); | 480 | gk20a_vidmem_destroy(g); |
481 | nvgpu_pd_cache_fini(g); | ||
481 | } | 482 | } |
482 | 483 | ||
483 | static int gk20a_alloc_sysmem_flush(struct gk20a *g) | 484 | static int gk20a_alloc_sysmem_flush(struct gk20a *g) |
@@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g, | |||
1560 | struct nvgpu_gmmu_pd *pd, u64 addr) | 1561 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1561 | { | 1562 | { |
1562 | u32 pde0_bits = | 1563 | u32 pde0_bits = |
1563 | nvgpu_aperture_mask(g, &pd->mem, | 1564 | nvgpu_aperture_mask(g, pd->mem, |
1564 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), | 1565 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), |
1565 | gmmu_pde_aperture_big_video_memory_f()) | | 1566 | gmmu_pde_aperture_big_video_memory_f()) | |
1566 | gmmu_pde_address_big_sys_f( | 1567 | gmmu_pde_address_big_sys_f( |
@@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g, | |||
1573 | struct nvgpu_gmmu_pd *pd, u64 addr) | 1574 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1574 | { | 1575 | { |
1575 | u32 pde1_bits = | 1576 | u32 pde1_bits = |
1576 | nvgpu_aperture_mask(g, &pd->mem, | 1577 | nvgpu_aperture_mask(g, pd->mem, |
1577 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), | 1578 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), |
1578 | gmmu_pde_aperture_small_video_memory_f()) | | 1579 | gmmu_pde_aperture_small_video_memory_f()) | |
1579 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ | 1580 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ |
@@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) | |||
2173 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 2174 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
2174 | struct vm_gk20a *vm) | 2175 | struct vm_gk20a *vm) |
2175 | { | 2176 | { |
2176 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); | 2177 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); |
2177 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 2178 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
2178 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 2179 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
2179 | 2180 | ||
2180 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); | 2181 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); |
2181 | 2182 | ||
2182 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), | 2183 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), |
2183 | nvgpu_aperture_mask(g, &vm->pdb.mem, | 2184 | nvgpu_aperture_mask(g, vm->pdb.mem, |
2184 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), | 2185 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), |
2185 | ram_in_page_dir_base_target_vid_mem_f()) | | 2186 | ram_in_page_dir_base_target_vid_mem_f()) | |
2186 | ram_in_page_dir_base_vol_true_f() | | 2187 | ram_in_page_dir_base_vol_true_f() | |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index a245d0e0..cadcffa4 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -31,6 +31,8 @@ | |||
31 | #include <nvgpu/rbtree.h> | 31 | #include <nvgpu/rbtree.h> |
32 | #include <nvgpu/kref.h> | 32 | #include <nvgpu/kref.h> |
33 | 33 | ||
34 | struct nvgpu_pd_cache; | ||
35 | |||
34 | #ifdef CONFIG_ARM64 | 36 | #ifdef CONFIG_ARM64 |
35 | #define outer_flush_range(a, b) | 37 | #define outer_flush_range(a, b) |
36 | #define __cpuc_flush_dcache_area __flush_dcache_area | 38 | #define __cpuc_flush_dcache_area __flush_dcache_area |
@@ -217,6 +219,8 @@ struct mm_gk20a { | |||
217 | struct vm_gk20a *vm; | 219 | struct vm_gk20a *vm; |
218 | } ce; | 220 | } ce; |
219 | 221 | ||
222 | struct nvgpu_pd_cache *pd_cache; | ||
223 | |||
220 | struct nvgpu_mutex l2_op_lock; | 224 | struct nvgpu_mutex l2_op_lock; |
221 | struct nvgpu_mutex tlb_lock; | 225 | struct nvgpu_mutex tlb_lock; |
222 | struct nvgpu_mutex priv_lock; | 226 | struct nvgpu_mutex priv_lock; |
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index c3867e9d..2ff199c6 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c | |||
@@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm, | |||
164 | 164 | ||
165 | phys_addr >>= gmmu_new_pde_address_shift_v(); | 165 | phys_addr >>= gmmu_new_pde_address_shift_v(); |
166 | 166 | ||
167 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, | 167 | pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, |
168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), | 168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), |
169 | gmmu_new_pde_aperture_video_memory_f()); | 169 | gmmu_new_pde_aperture_video_memory_f()); |
170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); | 170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); |
@@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, | |||
209 | if (small_valid) { | 209 | if (small_valid) { |
210 | pde_v[2] |= | 210 | pde_v[2] |= |
211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); | 211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); |
212 | pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, | 212 | pde_v[2] |= nvgpu_aperture_mask(g, pd->mem, |
213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), | 213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), |
214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); | 214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); |
215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); | 215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); |
@@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, | |||
219 | if (big_valid) { | 219 | if (big_valid) { |
220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); | 220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); |
221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); | 221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); |
222 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, | 222 | pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, |
223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), | 223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), |
224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); | 224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); |
225 | pde_v[1] |= big_addr >> 28; | 225 | pde_v[1] |= big_addr >> 28; |
@@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, | |||
365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
366 | struct vm_gk20a *vm) | 366 | struct vm_gk20a *vm) |
367 | { | 367 | { |
368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); | 368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); |
369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
371 | 371 | ||
372 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); | 372 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); |
373 | 373 | ||
374 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), | 374 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), |
375 | nvgpu_aperture_mask(g, &vm->pdb.mem, | 375 | nvgpu_aperture_mask(g, vm->pdb.mem, |
376 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), | 376 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), |
377 | ram_in_page_dir_base_target_vid_mem_f()) | | 377 | ram_in_page_dir_base_target_vid_mem_f()) | |
378 | ram_in_page_dir_base_vol_true_f() | | 378 | ram_in_page_dir_base_vol_true_f() | |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index 28a2cb82..eff87c31 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h | |||
@@ -19,6 +19,9 @@ | |||
19 | 19 | ||
20 | #include <nvgpu/types.h> | 20 | #include <nvgpu/types.h> |
21 | #include <nvgpu/nvgpu_mem.h> | 21 | #include <nvgpu/nvgpu_mem.h> |
22 | #include <nvgpu/list.h> | ||
23 | #include <nvgpu/rbtree.h> | ||
24 | #include <nvgpu/lock.h> | ||
22 | 25 | ||
23 | struct scatterlist; | 26 | struct scatterlist; |
24 | 27 | ||
@@ -45,14 +48,85 @@ enum gk20a_mem_rw_flag { | |||
45 | }; | 48 | }; |
46 | 49 | ||
47 | /* | 50 | /* |
51 | * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache | ||
52 | * structure of course depends on this. The MIN_SHIFT define is the right | ||
53 | * number of bits to shift to determine which list to use in the array of lists. | ||
54 | */ | ||
55 | #define NVGPU_PD_CACHE_MIN 256 | ||
56 | #define NVGPU_PD_CACHE_MIN_SHIFT 9 | ||
57 | #define NVGPU_PD_CACHE_COUNT 4 | ||
58 | |||
59 | struct nvgpu_pd_mem_entry { | ||
60 | struct nvgpu_mem mem; | ||
61 | |||
62 | /* | ||
63 | * Size of the page directories (not the mem). bmap is a bitmap showing | ||
64 | * which PDs have been allocated. The size of mem will always be one | ||
65 | * page. pd_size will always be a power of 2. | ||
66 | */ | ||
67 | u32 pd_size; | ||
68 | unsigned long alloc_map; | ||
69 | |||
70 | struct nvgpu_list_node list_entry; | ||
71 | struct nvgpu_rbtree_node tree_entry; | ||
72 | }; | ||
73 | |||
74 | static inline struct nvgpu_pd_mem_entry * | ||
75 | nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node) | ||
76 | { | ||
77 | return (struct nvgpu_pd_mem_entry *) | ||
78 | ((uintptr_t)node - | ||
79 | offsetof(struct nvgpu_pd_mem_entry, list_entry)); | ||
80 | }; | ||
81 | |||
82 | static inline struct nvgpu_pd_mem_entry * | ||
83 | nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node) | ||
84 | { | ||
85 | return (struct nvgpu_pd_mem_entry *) | ||
86 | ((uintptr_t)node - | ||
87 | offsetof(struct nvgpu_pd_mem_entry, tree_entry)); | ||
88 | }; | ||
89 | |||
90 | /* | ||
91 | * A cache for allocating PD memory from. This enables smaller PDs to be packed | ||
92 | * into single pages. | ||
93 | * | ||
94 | * This is fairly complex so see the documentation in pd_cache.c for a full | ||
95 | * description of how this is organized. | ||
96 | */ | ||
97 | struct nvgpu_pd_cache { | ||
98 | /* | ||
99 | * Array of lists of full nvgpu_pd_mem_entries and partially full (or | ||
100 | * empty) nvgpu_pd_mem_entries. | ||
101 | */ | ||
102 | struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
103 | struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
104 | |||
105 | /* | ||
106 | * Tree of all allocated struct nvgpu_mem's for fast look up. | ||
107 | */ | ||
108 | struct nvgpu_rbtree_node *mem_tree; | ||
109 | |||
110 | /* | ||
111 | * All access to the cache must be locked. This protects the lists and | ||
112 | * the rb tree. | ||
113 | */ | ||
114 | struct nvgpu_mutex lock; | ||
115 | }; | ||
116 | |||
117 | /* | ||
48 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs | 118 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs |
49 | * in the GMMU. | 119 | * in the GMMU. |
50 | */ | 120 | */ |
51 | struct nvgpu_gmmu_pd { | 121 | struct nvgpu_gmmu_pd { |
52 | /* | 122 | /* |
53 | * DMA memory describing the PTEs or PTEs. | 123 | * DMA memory describing the PTEs or PDEs. @mem_offs describes the |
124 | * offset of the PDE table in @mem. @cached specifies if this PD is | ||
125 | * using pd_cache memory. | ||
54 | */ | 126 | */ |
55 | struct nvgpu_mem mem; | 127 | struct nvgpu_mem *mem; |
128 | u32 mem_offs; | ||
129 | bool cached; | ||
56 | 130 | ||
57 | /* | 131 | /* |
58 | * List of pointers to the next level of page tables. Does not | 132 | * List of pointers to the next level of page tables. Does not |
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd { | |||
66 | * Reduce the number of arguments getting passed through the various levels of | 140 | * Reduce the number of arguments getting passed through the various levels of |
67 | * GMMU mapping functions. | 141 | * GMMU mapping functions. |
68 | * | 142 | * |
69 | * The following fields are set statically and do not change throughout | 143 | * The following fields are set statically and do not change throughout the |
70 | * mapping call: | 144 | * mapping call: |
71 | * | 145 | * |
72 | * pgsz: Index into the page size table. | 146 | * pgsz: Index into the page size table. |
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, | |||
166 | struct nvgpu_mem *mem, | 240 | struct nvgpu_mem *mem, |
167 | u64 gpu_va); | 241 | u64 gpu_va); |
168 | 242 | ||
169 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | 243 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes); |
170 | struct nvgpu_gmmu_pd *entry); | 244 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd); |
245 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
246 | struct nvgpu_gmmu_pd *pd, u32 bytes); | ||
247 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd); | ||
248 | int nvgpu_pd_cache_init(struct gk20a *g); | ||
249 | void nvgpu_pd_cache_fini(struct gk20a *g); | ||
171 | 250 | ||
172 | /* | 251 | /* |
173 | * Some useful routines that are shared across chips. | 252 | * Some useful routines that are shared across chips. |
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l, | |||
181 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, | 260 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, |
182 | size_t w, size_t data) | 261 | size_t w, size_t data) |
183 | { | 262 | { |
184 | nvgpu_mem_wr32(g, &pd->mem, w, data); | 263 | nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data); |
185 | } | 264 | } |
186 | 265 | ||
187 | 266 | ||
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 3b8e6b19..a1110a59 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h | |||
@@ -68,6 +68,7 @@ enum nvgpu_log_categories { | |||
68 | gpu_dbg_xv = BIT(17), /* XVE debugging. */ | 68 | gpu_dbg_xv = BIT(17), /* XVE debugging. */ |
69 | gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */ | 69 | gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */ |
70 | gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */ | 70 | gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */ |
71 | gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */ | ||
71 | gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ | 72 | gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ |
72 | }; | 73 | }; |
73 | 74 | ||