Diffstat (limited to 'drivers/gpu/nvgpu')

 drivers/gpu/nvgpu/common/mm/gmmu.c           | 976
 drivers/gpu/nvgpu/common/mm/vm.c             |  36
 drivers/gpu/nvgpu/gk20a/fb_gk20a.c           |   2
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c           | 306
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h           |  16
 drivers/gpu/nvgpu/gp10b/mm_gp10b.c           | 309
 drivers/gpu/nvgpu/include/nvgpu/gmmu.h       | 136
 drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h  |   6
 drivers/gpu/nvgpu/include/nvgpu/vm.h         |   3

 9 files changed, 979 insertions(+), 811 deletions(-)
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@ | |||
25 | #include "gk20a/gk20a.h" | 25 | #include "gk20a/gk20a.h" |
26 | #include "gk20a/mm_gk20a.h" | 26 | #include "gk20a/mm_gk20a.h" |
27 | 27 | ||
28 | #define gmmu_dbg(g, fmt, args...) \ | 28 | #define __gmmu_dbg(g, attrs, fmt, args...) \ |
29 | nvgpu_log(g, gpu_dbg_map, fmt, ##args) | 29 | do { \ |
30 | #define gmmu_dbg_v(g, fmt, args...) \ | 30 | if (attrs->debug) \ |
31 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) | 31 | nvgpu_info(g, fmt, ##args); \ |
32 | 32 | else \ | |
33 | static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 33 | nvgpu_log(g, gpu_dbg_map, fmt, ##args); \ |
34 | { | 34 | } while (0) |
35 | return nvgpu_mem_begin(g, &entry->mem); | 35 | |
36 | } | 36 | #define __gmmu_dbg_v(g, attrs, fmt, args...) \ |
37 | 37 | do { \ | |
38 | static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 38 | if (attrs->debug) \ |
39 | { | 39 | nvgpu_info(g, fmt, ##args); \ |
40 | nvgpu_mem_end(g, &entry->mem); | 40 | else \ |
41 | } | 41 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \ |
42 | 42 | } while (0) | |
43 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, | 43 | |
44 | struct gk20a_mm_entry *entry) | 44 | static int pd_allocate(struct vm_gk20a *vm, |
45 | { | 45 | struct nvgpu_gmmu_pd *pd, |
46 | struct gk20a *g = gk20a_from_vm(vm); | 46 | const struct gk20a_mmu_level *l, |
47 | u32 num_pages = 1 << order; | 47 | struct nvgpu_gmmu_attrs *attrs); |
48 | u32 len = num_pages * PAGE_SIZE; | ||
49 | int err; | ||
50 | |||
51 | err = nvgpu_dma_alloc(g, len, &entry->mem); | ||
52 | |||
53 | if (err) { | ||
54 | nvgpu_err(g, "memory allocation failed"); | ||
55 | return -ENOMEM; | ||
56 | } | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
62 | struct gk20a_mm_entry *entry) | ||
63 | { | ||
64 | struct gk20a *g = gk20a_from_vm(vm); | ||
65 | |||
66 | if (!entry->mem.size) | ||
67 | return; | ||
68 | |||
69 | if (entry->woffset) /* fake shadow mem */ | ||
70 | return; | ||
71 | |||
72 | nvgpu_dma_free(g, &entry->mem); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Allocate a phys contig region big enough for a full | ||
77 | * sized gmmu page table for the given gmmu_page_size. | ||
78 | * the whole range is zeroed so it's "invalid"/will fault. | ||
79 | * | ||
80 | * If a previous entry is supplied, its memory will be used for | ||
81 | * suballocation for this next entry too, if there is space. | ||
82 | */ | ||
83 | int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, | ||
84 | enum gmmu_pgsz_gk20a pgsz_idx, | ||
85 | const struct gk20a_mmu_level *l, | ||
86 | struct gk20a_mm_entry *entry, | ||
87 | struct gk20a_mm_entry *prev_entry) | ||
88 | { | ||
89 | int err = -ENOMEM; | ||
90 | int order; | ||
91 | struct gk20a *g = gk20a_from_vm(vm); | ||
92 | u32 bytes; | ||
93 | |||
94 | /* allocate enough pages for the table */ | ||
95 | order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1; | ||
96 | order += ilog2(l->entry_size); | ||
97 | bytes = 1 << order; | ||
98 | order -= PAGE_SHIFT; | ||
99 | if (order < 0 && prev_entry) { | ||
100 | /* try to suballocate from previous chunk */ | ||
101 | u32 capacity = prev_entry->mem.size / bytes; | ||
102 | u32 prev = prev_entry->woffset * sizeof(u32) / bytes; | ||
103 | u32 free = capacity - prev - 1; | ||
104 | |||
105 | nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d", | ||
106 | capacity, prev, free, bytes); | ||
107 | |||
108 | if (free) { | ||
109 | memcpy(&entry->mem, &prev_entry->mem, | ||
110 | sizeof(entry->mem)); | ||
111 | entry->woffset = prev_entry->woffset | ||
112 | + bytes / sizeof(u32); | ||
113 | err = 0; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | if (err) { | ||
118 | /* no suballoc space */ | ||
119 | order = max(0, order); | ||
120 | err = nvgpu_alloc_gmmu_pages(vm, order, entry); | ||
121 | entry->woffset = 0; | ||
122 | } | ||
123 | |||
124 | nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x", | ||
125 | entry, | ||
126 | (entry->mem.priv.sgt && | ||
127 | entry->mem.aperture == APERTURE_SYSMEM) ? | ||
128 | g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0, | ||
129 | order, entry->woffset); | ||
130 | if (err) | ||
131 | return err; | ||
132 | entry->pgsz = pgsz_idx; | ||
133 | entry->mem.skip_wmb = true; | ||
134 | |||
135 | return err; | ||
136 | } | ||
137 | 48 | ||
138 | /* | 49 | /* |
139 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU | 50 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU |
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) | |||
225 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 136 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
226 | } | 137 | } |
227 | 138 | ||
228 | static int update_gmmu_level_locked(struct vm_gk20a *vm, | 139 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) |
229 | struct gk20a_mm_entry *pte, | 140 | { |
230 | enum gmmu_pgsz_gk20a pgsz_idx, | 141 | /* |
231 | struct scatterlist **sgl, | 142 | * Need this just for page size. Everything else can be ignored. Also |
232 | u64 *offset, | 143 | * note that we can just use pgsz 0 (i.e small pages) since the number |
233 | u64 *iova, | 144 | * of bits present in the top level PDE are the same for small/large |
234 | u64 gpu_va, u64 gpu_end, | 145 | * page VMs. |
235 | u8 kind_v, u64 *ctag, | 146 | */ |
236 | bool cacheable, bool unmapped_pte, | 147 | struct nvgpu_gmmu_attrs attrs = { |
237 | int rw_flag, | 148 | .pgsz = 0, |
238 | bool sparse, | 149 | }; |
239 | int lvl, | 150 | |
240 | bool priv, | 151 | return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); |
241 | enum nvgpu_aperture aperture) | 152 | } |
153 | |||
154 | |||
155 | /* | ||
156 | * Ensure that there's a CPU mapping for the page directory memory. This won't | ||
157 | * always be the case for 32 bit systems since we may need to save kernel | ||
158 | * virtual memory. | ||
159 | */ | ||
160 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
161 | { | ||
162 | return nvgpu_mem_begin(g, &entry->mem); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. | ||
167 | * For 64 bit platforms this is a noop. | ||
168 | */ | ||
169 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
170 | { | ||
171 | nvgpu_mem_end(g, &entry->mem); | ||
172 | } | ||
173 | |||
174 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, | ||
175 | struct nvgpu_gmmu_pd *pd) | ||
242 | { | 176 | { |
243 | struct gk20a *g = gk20a_from_vm(vm); | 177 | struct gk20a *g = gk20a_from_vm(vm); |
244 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | 178 | unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; |
245 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; | 179 | int err; |
246 | int err = 0; | 180 | |
247 | u32 pde_i; | 181 | /* |
248 | u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; | 182 | * On arm32 vmalloc space is a precious commodity so we do not map pages |
249 | struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; | 183 | * by default. |
184 | */ | ||
185 | if (!IS_ENABLED(CONFIG_ARM64)) | ||
186 | flags |= NVGPU_DMA_NO_KERNEL_MAPPING; | ||
187 | |||
188 | err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); | ||
189 | if (err) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
196 | struct nvgpu_gmmu_pd *pd) | ||
197 | { | ||
198 | struct gk20a *g = gk20a_from_vm(vm); | ||
199 | |||
200 | nvgpu_dma_free(g, &pd->mem); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Return the _physical_ address of a page directory. | ||
205 | */ | ||
206 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
207 | { | ||
208 | if (g->mm.has_physical_mode) | ||
209 | return sg_phys(pd->mem.priv.sgt->sgl); | ||
210 | else | ||
211 | return nvgpu_mem_get_base_addr(g, &pd->mem, 0); | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Return the aligned length based on the page size in attrs. | ||
216 | */ | ||
217 | static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length, | ||
218 | struct nvgpu_gmmu_attrs *attrs) | ||
219 | { | ||
220 | u64 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
221 | |||
222 | return ALIGN(length, page_size); | ||
223 | } | ||
224 | |||
225 | static u32 pd_entries(const struct gk20a_mmu_level *l, | ||
226 | struct nvgpu_gmmu_attrs *attrs) | ||
227 | { | ||
228 | /* | ||
229 | * Number of entries in a PD is easy to compute from the number of bits | ||
230 | * used to index the page directory. That is simply 2 raised to the | ||
231 | * number of bits. | ||
232 | */ | ||
233 | return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Computes the size of a PD table. | ||
238 | */ | ||
239 | static u32 pd_size(const struct gk20a_mmu_level *l, | ||
240 | struct nvgpu_gmmu_attrs *attrs) | ||
241 | { | ||
242 | return pd_entries(l, attrs) * l->entry_size; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Allocate a physically contiguous region big enough for a gmmu page table | ||
247 | * of the specified level and page size. The whole range is zeroed so that any | ||
248 | * accesses will fault until proper values are programmed. | ||
249 | */ | ||
250 | static int pd_allocate(struct vm_gk20a *vm, | ||
251 | struct nvgpu_gmmu_pd *pd, | ||
252 | const struct gk20a_mmu_level *l, | ||
253 | struct nvgpu_gmmu_attrs *attrs) | ||
254 | { | ||
255 | int err; | ||
250 | 256 | ||
251 | gk20a_dbg_fn(""); | 257 | if (pd->mem.size) |
258 | return 0; | ||
252 | 259 | ||
253 | pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) | 260 | err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); |
254 | >> (u64)l->lo_bit[pgsz_idx]; | 261 | if (err) { |
262 | nvgpu_info(vm->mm->g, "error allocating page directory!"); | ||
263 | return err; | ||
264 | } | ||
255 | 265 | ||
256 | gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", | 266 | /* |
257 | pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); | 267 | * One mb() is done after all mapping operations. Don't need individual |
268 | * barriers for each PD write. | ||
269 | */ | ||
270 | pd->mem.skip_wmb = true; | ||
258 | 271 | ||
259 | while (gpu_va < gpu_end) { | 272 | return 0; |
260 | u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); | 273 | } |
261 | 274 | ||
262 | /* Allocate next level */ | 275 | /* |
276 | * Compute what page directory index at the passed level the passed virtual | ||
277 | * address corresponds to. @attrs is necessary for determining the page size | ||
278 | * which is used to pick the right bit offsets for the GMMU level. | ||
279 | */ | ||
280 | static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt, | ||
281 | struct nvgpu_gmmu_attrs *attrs) | ||
282 | { | ||
283 | u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL; | ||
284 | u32 pd_shift = (u64)l->lo_bit[attrs->pgsz]; | ||
285 | |||
286 | /* | ||
287 | * For convenience we don't bother computing the lower bound of the | ||
288 | * mask; it's easier to just shift it off. | ||
289 | */ | ||
290 | return (virt & pd_mask) >> pd_shift; | ||
291 | } | ||
292 | |||
293 | static int pd_allocate_children(struct vm_gk20a *vm, | ||
294 | const struct gk20a_mmu_level *l, | ||
295 | struct nvgpu_gmmu_pd *pd, | ||
296 | struct nvgpu_gmmu_attrs *attrs) | ||
297 | { | ||
298 | struct gk20a *g = gk20a_from_vm(vm); | ||
299 | |||
300 | if (pd->entries) | ||
301 | return 0; | ||
302 | |||
303 | pd->num_entries = pd_entries(l, attrs); | ||
304 | pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) * | ||
305 | pd->num_entries); | ||
306 | if (!pd->entries) | ||
307 | return -ENOMEM; | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * This function programs the GMMU based on two ranges: a physical range and a | ||
314 | * GPU virtual range. The virtual is mapped to the physical. Physical in this | ||
315 | * case can mean either a real physical sysmem address or an IO virtual address | ||
316 | * (for instance when a system has an IOMMU running). | ||
317 | * | ||
318 | * The rest of the parameters are for describing the actual mapping itself. | ||
319 | * | ||
320 | * This function recursively calls itself for handling PDEs. At the final level | ||
321 | * a PTE handler is called. The phys and virt ranges are adjusted for each | ||
322 | * recursion so that each invocation of this function need only worry about the | ||
323 | * range it is passed. | ||
324 | * | ||
325 | * phys_addr will always point to a contiguous range - the discontiguous nature | ||
326 | * of DMA buffers is taken care of at the layer above this. | ||
327 | */ | ||
328 | static int __set_pd_level(struct vm_gk20a *vm, | ||
329 | struct nvgpu_gmmu_pd *pd, | ||
330 | int lvl, | ||
331 | u64 phys_addr, | ||
332 | u64 virt_addr, u64 length, | ||
333 | struct nvgpu_gmmu_attrs *attrs) | ||
334 | { | ||
335 | int err = 0; | ||
336 | u64 pde_range; | ||
337 | struct gk20a *g = gk20a_from_vm(vm); | ||
338 | struct nvgpu_gmmu_pd *next_pd = NULL; | ||
339 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | ||
340 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1]; | ||
341 | |||
342 | /* | ||
343 | * 5 levels for Pascal+. For pre-pascal we only have 2. This puts | ||
344 | * offsets into the page table debugging code which makes it easier to | ||
345 | * see what level prints are from. | ||
346 | */ | ||
347 | static const char *__lvl_debug[] = { | ||
348 | "", /* L=0 */ | ||
349 | " ", /* L=1 */ | ||
350 | " ", /* L=2 */ | ||
351 | " ", /* L=3 */ | ||
352 | " ", /* L=4 */ | ||
353 | }; | ||
354 | |||
355 | pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz]; | ||
356 | |||
357 | __gmmu_dbg_v(g, attrs, | ||
358 | "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx", | ||
359 | lvl, | ||
360 | __lvl_debug[lvl], | ||
361 | virt_addr, | ||
362 | length, | ||
363 | phys_addr); | ||
364 | |||
365 | /* | ||
366 | * Iterate across the mapping in chunks the size of this level's PDE. | ||
367 | * For each of those chunks program our level's PDE and then, if there's | ||
368 | * a next level, program the next level's PDEs/PTEs. | ||
369 | */ | ||
370 | while (length) { | ||
371 | u32 pd_idx = pd_index(l, virt_addr, attrs); | ||
372 | u64 chunk_size; | ||
373 | u64 target_addr; | ||
374 | |||
375 | /* | ||
376 | * Truncate the pde_range when the virtual address does not | ||
377 | * start at a PDE boundary. | ||
378 | */ | ||
379 | chunk_size = min(length, | ||
380 | pde_range - (virt_addr & (pde_range - 1))); | ||
381 | |||
382 | /* | ||
383 | * If the next level has an update_entry function then we know | ||
384 | * that _this_ level points to PDEs (not PTEs). Thus we need to | ||
385 | * have a bunch of children PDs. | ||
386 | */ | ||
263 | if (next_l->update_entry) { | 387 | if (next_l->update_entry) { |
264 | if (!pte->entries) { | 388 | if (pd_allocate_children(vm, l, pd, attrs)) |
265 | int num_entries = | 389 | return -ENOMEM; |
266 | 1 << | 390 | |
267 | (l->hi_bit[pgsz_idx] | 391 | /* |
268 | - l->lo_bit[pgsz_idx] + 1); | 392 | * Get the next PD so that we know what to put in this |
269 | pte->entries = | 393 | * current PD. If the next level is actually PTEs then |
270 | nvgpu_vzalloc(g, | 394 | * we don't need this - we will just use the real |
271 | sizeof(struct gk20a_mm_entry) * | 395 | * physical target. |
272 | num_entries); | 396 | */ |
273 | if (!pte->entries) | 397 | next_pd = &pd->entries[pd_idx]; |
274 | return -ENOMEM; | 398 | |
275 | pte->pgsz = pgsz_idx; | 399 | /* |
276 | pte->num_entries = num_entries; | 400 | * Allocate the backing memory for next_pd. |
277 | } | 401 | */ |
278 | prev_pte = next_pte; | 402 | if (pd_allocate(vm, next_pd, next_l, attrs)) |
279 | next_pte = pte->entries + pde_i; | 403 | return -ENOMEM; |
280 | |||
281 | if (!next_pte->mem.size) { | ||
282 | err = nvgpu_zalloc_gmmu_page_table(vm, | ||
283 | pgsz_idx, next_l, next_pte, prev_pte); | ||
284 | if (err) | ||
285 | return err; | ||
286 | } | ||
287 | } | 404 | } |
288 | 405 | ||
289 | err = l->update_entry(vm, pte, pde_i, pgsz_idx, | 406 | /* |
290 | sgl, offset, iova, | 407 | * This is the address we want to program into the actual PDE/ |
291 | kind_v, ctag, cacheable, unmapped_pte, | 408 | * PTE. When the next level is PDEs we need the target address |
292 | rw_flag, sparse, priv, aperture); | 409 | * to be the table of PDEs. When the next level is PTEs the |
293 | if (err) | 410 | * target addr is the real physical address we are aiming for. |
294 | return err; | 411 | */ |
412 | target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : | ||
413 | phys_addr; | ||
414 | |||
415 | l->update_entry(vm, l, | ||
416 | pd, pd_idx, | ||
417 | virt_addr, | ||
418 | target_addr, | ||
419 | attrs); | ||
295 | 420 | ||
296 | if (next_l->update_entry) { | 421 | if (next_l->update_entry) { |
297 | /* get cpu access to the ptes */ | 422 | err = map_gmmu_pages(g, next_pd); |
298 | err = map_gmmu_pages(g, next_pte); | ||
299 | if (err) { | 423 | if (err) { |
300 | nvgpu_err(g, | 424 | nvgpu_err(g, |
301 | "couldn't map ptes for update as=%d", | 425 | "couldn't map ptes for update as=%d", |
302 | vm_aspace_id(vm)); | 426 | vm_aspace_id(vm)); |
303 | return err; | 427 | return err; |
304 | } | 428 | } |
305 | err = update_gmmu_level_locked(vm, next_pte, | 429 | |
306 | pgsz_idx, | 430 | err = __set_pd_level(vm, next_pd, |
307 | sgl, | 431 | lvl + 1, |
308 | offset, | 432 | phys_addr, |
309 | iova, | 433 | virt_addr, |
310 | gpu_va, | 434 | chunk_size, |
311 | next, | 435 | attrs); |
312 | kind_v, ctag, cacheable, unmapped_pte, | 436 | unmap_gmmu_pages(g, next_pd); |
313 | rw_flag, sparse, lvl+1, priv, aperture); | ||
314 | unmap_gmmu_pages(g, next_pte); | ||
315 | 437 | ||
316 | if (err) | 438 | if (err) |
317 | return err; | 439 | return err; |
318 | } | 440 | } |
319 | 441 | ||
320 | pde_i++; | 442 | virt_addr += chunk_size; |
321 | gpu_va = next; | 443 | |
444 | /* | ||
445 | * Only add to phys_addr if it's non-zero. A zero value implies | ||
446 | * we are unmapping and as a result we don't want to place | ||
447 | * non-zero phys addresses in the PTEs. A non-zero phys-addr | ||
448 | * would also confuse the lower level PTE programming code. | ||
449 | */ | ||
450 | if (phys_addr) | ||
451 | phys_addr += chunk_size; | ||
452 | length -= chunk_size; | ||
453 | } | ||
454 | |||
455 | __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!"); | ||
456 | |||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * VIDMEM version of the update_ptes logic. | ||
462 | */ | ||
463 | static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, | ||
464 | struct sg_table *sgt, | ||
465 | u64 space_to_skip, | ||
466 | u64 virt_addr, | ||
467 | u64 length, | ||
468 | struct nvgpu_gmmu_attrs *attrs) | ||
469 | { | ||
470 | struct nvgpu_page_alloc *alloc = NULL; | ||
471 | struct page_alloc_chunk *chunk = NULL; | ||
472 | u64 phys_addr, chunk_length; | ||
473 | int err = 0; | ||
474 | |||
475 | if (!sgt) { | ||
476 | /* | ||
477 | * This is considered an unmap. Just pass in 0 as the physical | ||
478 | * address for the entire GPU range. | ||
479 | */ | ||
480 | err = __set_pd_level(vm, &vm->pdb, | ||
481 | 0, | ||
482 | 0, | ||
483 | virt_addr, length, | ||
484 | attrs); | ||
485 | return err; | ||
486 | } | ||
487 | |||
488 | alloc = get_vidmem_page_alloc(sgt->sgl); | ||
489 | |||
490 | /* | ||
491 | * Otherwise iterate across all the chunks in this allocation and | ||
492 | * map them. | ||
493 | */ | ||
494 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | ||
495 | page_alloc_chunk, list_entry) { | ||
496 | if (space_to_skip && | ||
497 | space_to_skip >= chunk->length) { | ||
498 | space_to_skip -= chunk->length; | ||
499 | continue; | ||
500 | } | ||
501 | |||
502 | phys_addr = chunk->base + space_to_skip; | ||
503 | chunk_length = min(length, (chunk->length - space_to_skip)); | ||
504 | |||
505 | err = __set_pd_level(vm, &vm->pdb, | ||
506 | 0, | ||
507 | phys_addr, | ||
508 | virt_addr, length, | ||
509 | attrs); | ||
510 | if (err) | ||
511 | break; | ||
512 | |||
513 | /* Space has been skipped so zero this for future chunks. */ | ||
514 | space_to_skip = 0; | ||
515 | |||
516 | /* | ||
517 | * Update the map pointer and the remaining length. | ||
518 | */ | ||
519 | virt_addr += chunk_length; | ||
520 | length -= chunk_length; | ||
521 | |||
522 | if (length == 0) | ||
523 | break; | ||
322 | } | 524 | } |
323 | 525 | ||
324 | gk20a_dbg_fn("done"); | 526 | return err; |
527 | } | ||
528 | |||
529 | static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, | ||
530 | struct sg_table *sgt, | ||
531 | u64 space_to_skip, | ||
532 | u64 virt_addr, | ||
533 | u64 length, | ||
534 | struct nvgpu_gmmu_attrs *attrs) | ||
535 | { | ||
536 | int err; | ||
537 | struct scatterlist *sgl; | ||
538 | struct gk20a *g = gk20a_from_vm(vm); | ||
539 | |||
540 | if (!sgt) { | ||
541 | /* | ||
542 | * This is considered an unmap. Just pass in 0 as the physical | ||
543 | * address for the entire GPU range. | ||
544 | */ | ||
545 | err = __set_pd_level(vm, &vm->pdb, | ||
546 | 0, | ||
547 | 0, | ||
548 | virt_addr, length, | ||
549 | attrs); | ||
550 | return err; | ||
551 | } | ||
552 | |||
553 | /* | ||
554 | * At this point we have a Linux scatter-gather list pointing to some | ||
555 | * number of discontiguous chunks of memory. Iterate over that list and | ||
556 | * generate a GMMU map call for each chunk. There are two possibilities: | ||
557 | * either the IOMMU is enabled or not. When the IOMMU is enabled the | ||
558 | * mapping is simple since the "physical" address is actually a virtual | ||
559 | * IO address and will be contiguous. The no-IOMMU case is more | ||
560 | * complicated. We will have to iterate over the SGT and do a separate | ||
561 | * map for each chunk of the SGT. | ||
562 | */ | ||
563 | sgl = sgt->sgl; | ||
564 | |||
565 | if (!g->mm.bypass_smmu) { | ||
566 | u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0); | ||
567 | |||
568 | io_addr += space_to_skip; | ||
569 | |||
570 | err = __set_pd_level(vm, &vm->pdb, | ||
571 | 0, | ||
572 | io_addr, | ||
573 | virt_addr, | ||
574 | length, | ||
575 | attrs); | ||
576 | |||
577 | return err; | ||
578 | } | ||
579 | |||
580 | /* | ||
581 | * Finally: last possible case: do the no-IOMMU mapping. In this case we | ||
582 | * really are mapping physical pages directly. | ||
583 | */ | ||
584 | while (sgl) { | ||
585 | u64 phys_addr; | ||
586 | u64 chunk_length; | ||
587 | |||
588 | /* | ||
589 | * Cut out sgl ents for space_to_skip. | ||
590 | */ | ||
591 | if (space_to_skip && space_to_skip >= sgl->length) { | ||
592 | space_to_skip -= sgl->length; | ||
593 | sgl = sg_next(sgl); | ||
594 | continue; | ||
595 | } | ||
596 | |||
597 | phys_addr = sg_phys(sgl) + space_to_skip; | ||
598 | chunk_length = min(length, sgl->length - space_to_skip); | ||
599 | |||
600 | err = __set_pd_level(vm, &vm->pdb, | ||
601 | 0, | ||
602 | phys_addr, | ||
603 | virt_addr, | ||
604 | chunk_length, | ||
605 | attrs); | ||
606 | if (err) | ||
607 | return err; | ||
608 | |||
609 | space_to_skip = 0; | ||
610 | virt_addr += chunk_length; | ||
611 | length -= chunk_length; | ||
612 | sgl = sg_next(sgl); | ||
613 | |||
614 | if (length == 0) | ||
615 | break; | ||
616 | } | ||
325 | 617 | ||
326 | return 0; | 618 | return 0; |
327 | } | 619 | } |
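
The pd_entries(), pd_size() and pd_index() helpers introduced above boil page-directory geometry down to the hi_bit/lo_bit pair carried by each gk20a_mmu_level. The standalone sketch below shows that bit arithmetic in isolation; it is illustrative only and not part of this change, and the level parameters used (VA bits [37:28], 8-byte entries) are invented for the example.

#include <stdio.h>
#include <stdint.h>

struct example_level {
	unsigned int hi_bit;       /* highest VA bit indexing this level */
	unsigned int lo_bit;       /* lowest VA bit indexing this level */
	unsigned int entry_size;   /* bytes per PDE/PTE entry */
};

/* 2 raised to the number of VA bits consumed by this level. */
static unsigned int example_pd_entries(const struct example_level *l)
{
	return 1u << (l->hi_bit - l->lo_bit + 1u);
}

/* Mask off bits above hi_bit, then shift off bits below lo_bit. */
static unsigned int example_pd_index(const struct example_level *l, uint64_t virt)
{
	uint64_t pd_mask = (1ULL << (l->hi_bit + 1u)) - 1ULL;

	return (unsigned int)((virt & pd_mask) >> l->lo_bit);
}

int main(void)
{
	/* Hypothetical level: VA bits [37:28] select the entry. */
	struct example_level l = { .hi_bit = 37, .lo_bit = 28, .entry_size = 8 };
	uint64_t va = 0x12345678000ULL;

	printf("entries=%u table_bytes=%u index of 0x%llx = %u\n",
	       example_pd_entries(&l),
	       example_pd_entries(&l) * l.entry_size,
	       (unsigned long long)va,
	       example_pd_index(&l, va));
	return 0;
}
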
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
332 | * physical* address. | 624 | * physical* address. |
333 | * | 625 | * |
334 | * The update of each level of the page tables is farmed out to chip specific | 626 | * The update of each level of the page tables is farmed out to chip specific |
335 | * implementations. But the logic around that is generic to all chips. Every chip | 627 | * implementations. But the logic around that is generic to all chips. Every |
336 | * has some number of PDE levels and then a PTE level. | 628 | * chip has some number of PDE levels and then a PTE level. |
337 | * | 629 | * |
338 | * Each chunk of the incoming SGT is sent to the chip specific implementation | 630 | * Each chunk of the incoming SGT is sent to the chip specific implementation |
339 | * of page table update. | 631 | * of page table update. |
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
341 | * [*] Note: the "physical" address may actually be an IO virtual address in the | 633 | * [*] Note: the "physical" address may actually be an IO virtual address in the |
342 | * case of SMMU usage. | 634 | * case of SMMU usage. |
343 | */ | 635 | */ |
344 | static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | 636 | static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, |
345 | enum gmmu_pgsz_gk20a pgsz_idx, | 637 | struct sg_table *sgt, |
346 | struct sg_table *sgt, | 638 | u64 space_to_skip, |
347 | u64 buffer_offset, | 639 | u64 virt_addr, |
348 | u64 gpu_va, u64 gpu_end, | 640 | u64 length, |
349 | u8 kind_v, u32 ctag_offset, | 641 | struct nvgpu_gmmu_attrs *attrs) |
350 | bool cacheable, bool unmapped_pte, | ||
351 | int rw_flag, | ||
352 | bool sparse, | ||
353 | bool priv, | ||
354 | enum nvgpu_aperture aperture) | ||
355 | { | 642 | { |
356 | struct gk20a *g = gk20a_from_vm(vm); | 643 | struct gk20a *g = gk20a_from_vm(vm); |
357 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 644 | u32 page_size; |
358 | u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; | ||
359 | u64 iova = 0; | ||
360 | u64 space_to_skip = buffer_offset; | ||
361 | u64 map_size = gpu_end - gpu_va; | ||
362 | u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; | ||
363 | int err; | 645 | int err; |
364 | struct scatterlist *sgl = NULL; | ||
365 | struct nvgpu_page_alloc *alloc = NULL; | ||
366 | struct page_alloc_chunk *chunk = NULL; | ||
367 | u64 length; | ||
368 | 646 | ||
369 | /* note: here we need to map kernel to small, since the | 647 | /* note: here we need to map kernel to small, since the |
370 | * low-level mmu code assumes 0 is small and 1 is big pages */ | 648 | * low-level mmu code assumes 0 is small and 1 is big pages */ |
371 | if (pgsz_idx == gmmu_page_size_kernel) | 649 | if (attrs->pgsz == gmmu_page_size_kernel) |
372 | pgsz_idx = gmmu_page_size_small; | 650 | attrs->pgsz = gmmu_page_size_small; |
651 | |||
652 | page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
373 | 653 | ||
374 | if (space_to_skip & (page_size - 1)) | 654 | if (space_to_skip & (page_size - 1)) |
375 | return -EINVAL; | 655 | return -EINVAL; |
376 | 656 | ||
657 | /* | ||
658 | * Update length to be aligned to the passed page size. | ||
659 | */ | ||
660 | length = nvgpu_align_map_length(vm, length, attrs); | ||
661 | |||
377 | err = map_gmmu_pages(g, &vm->pdb); | 662 | err = map_gmmu_pages(g, &vm->pdb); |
378 | if (err) { | 663 | if (err) { |
379 | nvgpu_err(g, | 664 | nvgpu_err(g, "couldn't map ptes for update as=%d", |
380 | "couldn't map ptes for update as=%d", | 665 | vm_aspace_id(vm)); |
381 | vm_aspace_id(vm)); | ||
382 | return err; | 666 | return err; |
383 | } | 667 | } |
384 | 668 | ||
385 | if (aperture == APERTURE_VIDMEM) { | 669 | __gmmu_dbg(g, attrs, |
386 | gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", | 670 | "vm=%s " |
387 | pgsz_idx, gpu_va, gpu_end-1); | 671 | "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " |
388 | 672 | "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " | |
389 | if (sgt) { | 673 | "kind=%#02x APT=%-6s %c%c%c", |
390 | alloc = get_vidmem_page_alloc(sgt->sgl); | 674 | vm->name, |
391 | 675 | sgt ? "MAP" : "UNMAP", | |
392 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | 676 | virt_addr, |
393 | page_alloc_chunk, list_entry) { | 677 | length, |
394 | if (space_to_skip && | 678 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, |
395 | space_to_skip > chunk->length) { | 679 | space_to_skip, |
396 | space_to_skip -= chunk->length; | 680 | page_size >> 10, |
397 | } else { | 681 | nvgpu_gmmu_perm_str(attrs->rw_flag), |
398 | iova = chunk->base + space_to_skip; | 682 | attrs->kind_v, |
399 | length = chunk->length - space_to_skip; | 683 | nvgpu_aperture_str(attrs->aperture), |
400 | length = min(length, map_size); | 684 | attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */ |
401 | space_to_skip = 0; | 685 | attrs->sparse ? 'S' : '-', |
402 | 686 | attrs->priv ? 'P' : '-'); | |
403 | err = update_gmmu_level_locked(vm, | 687 | |
404 | &vm->pdb, pgsz_idx, | 688 | /* |
405 | &sgl, | 689 | * Handle VIDMEM progamming. Currently uses a different scatter list |
406 | &space_to_skip, | 690 | * format. |
407 | &iova, | 691 | */ |
408 | gpu_va, gpu_va + length, | 692 | if (attrs->aperture == APERTURE_VIDMEM) |
409 | kind_v, &ctag, | 693 | err = __nvgpu_gmmu_update_page_table_vidmem(vm, |
410 | cacheable, unmapped_pte, | 694 | sgt, |
411 | rw_flag, sparse, 0, priv, | 695 | space_to_skip, |
412 | aperture); | 696 | virt_addr, |
413 | if (err) | 697 | length, |
414 | break; | 698 | attrs); |
415 | 699 | else | |
416 | /* need to set explicit zero here */ | 700 | err = __nvgpu_gmmu_update_page_table_sysmem(vm, |
417 | space_to_skip = 0; | 701 | sgt, |
418 | gpu_va += length; | 702 | space_to_skip, |
419 | map_size -= length; | 703 | virt_addr, |
420 | 704 | length, | |
421 | if (!map_size) | 705 | attrs); |
422 | break; | ||
423 | } | ||
424 | } | ||
425 | } else { | ||
426 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
427 | &sgl, | ||
428 | &space_to_skip, | ||
429 | &iova, | ||
430 | gpu_va, gpu_end, | ||
431 | kind_v, &ctag, | ||
432 | cacheable, unmapped_pte, rw_flag, | ||
433 | sparse, 0, priv, | ||
434 | aperture); | ||
435 | } | ||
436 | } else { | ||
437 | gmmu_dbg_v(g, | ||
438 | "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " | ||
439 | "buffer offset: %-4lld, nents: %d", | ||
440 | page_size, | ||
441 | gpu_va, gpu_end - gpu_va, | ||
442 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, | ||
443 | buffer_offset, | ||
444 | sgt ? sgt->nents : 0); | ||
445 | |||
446 | if (sgt) { | ||
447 | iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); | ||
448 | if (!vm->mm->bypass_smmu && iova) { | ||
449 | iova += space_to_skip; | ||
450 | } else { | ||
451 | sgl = sgt->sgl; | ||
452 | |||
453 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
454 | (u64)sg_phys(sgl), | ||
455 | sgl->length); | ||
456 | |||
457 | while (space_to_skip && sgl && | ||
458 | space_to_skip + page_size > sgl->length) { | ||
459 | space_to_skip -= sgl->length; | ||
460 | sgl = sg_next(sgl); | ||
461 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
462 | (u64)sg_phys(sgl), | ||
463 | sgl->length); | ||
464 | } | ||
465 | |||
466 | iova = sg_phys(sgl) + space_to_skip; | ||
467 | } | ||
468 | } | ||
469 | |||
470 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
471 | &sgl, | ||
472 | &space_to_skip, | ||
473 | &iova, | ||
474 | gpu_va, gpu_end, | ||
475 | kind_v, &ctag, | ||
476 | cacheable, unmapped_pte, rw_flag, | ||
477 | sparse, 0, priv, | ||
478 | aperture); | ||
479 | } | ||
480 | 706 | ||
481 | unmap_gmmu_pages(g, &vm->pdb); | 707 | unmap_gmmu_pages(g, &vm->pdb); |
482 | |||
483 | mb(); | 708 | mb(); |
484 | 709 | ||
485 | gk20a_dbg_fn("done"); | 710 | __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); |
486 | 711 | ||
487 | return err; | 712 | return err; |
488 | } | 713 | } |
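
Both __nvgpu_gmmu_update_page_table_sysmem() and the VIDMEM variant above walk a list of physical chunks, consuming space_to_skip from the front of the list, clamping the final chunk to the remaining map length, and handing each resulting (phys, virt, length) triple to __set_pd_level(). A reduced sketch of that walk over a plain array follows; all names and numbers are invented for illustration and are not part of this commit.

#include <stdio.h>
#include <stdint.h>

struct phys_chunk {
	uint64_t base;
	uint64_t length;
};

static void walk_chunks(const struct phys_chunk *chunks, int nr,
			uint64_t space_to_skip, uint64_t virt_addr,
			uint64_t length)
{
	int i;

	for (i = 0; i < nr && length; i++) {
		uint64_t phys, chunk_length;

		/* Skip whole chunks that fall before the mapping offset. */
		if (space_to_skip >= chunks[i].length) {
			space_to_skip -= chunks[i].length;
			continue;
		}

		phys = chunks[i].base + space_to_skip;
		chunk_length = chunks[i].length - space_to_skip;
		if (chunk_length > length)
			chunk_length = length;

		/* The real code calls __set_pd_level() here for each chunk. */
		printf("map virt 0x%llx -> phys 0x%llx (+0x%llx)\n",
		       (unsigned long long)virt_addr,
		       (unsigned long long)phys,
		       (unsigned long long)chunk_length);

		space_to_skip = 0;	/* Only the first chunk is offset. */
		virt_addr += chunk_length;
		length -= chunk_length;
	}
}

int main(void)
{
	struct phys_chunk chunks[] = {
		{ 0x80000000ULL, 0x10000ULL },
		{ 0x90000000ULL, 0x20000ULL },
	};

	walk_chunks(chunks, 2, 0x4000, 0x100000, 0x18000);
	return 0;
}
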
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | |||
500 | * have the update_gmmu_lock acquired. | 725 | * have the update_gmmu_lock acquired. |
501 | */ | 726 | */ |
502 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | 727 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, |
503 | u64 map_offset, | 728 | u64 vaddr, |
504 | struct sg_table *sgt, | 729 | struct sg_table *sgt, |
505 | u64 buffer_offset, | 730 | u64 buffer_offset, |
506 | u64 size, | 731 | u64 size, |
507 | int pgsz_idx, | 732 | int pgsz_idx, |
508 | u8 kind_v, | 733 | u8 kind_v, |
509 | u32 ctag_offset, | 734 | u32 ctag_offset, |
510 | u32 flags, | 735 | u32 flags, |
511 | int rw_flag, | 736 | int rw_flag, |
512 | bool clear_ctags, | 737 | bool clear_ctags, |
513 | bool sparse, | 738 | bool sparse, |
514 | bool priv, | 739 | bool priv, |
515 | struct vm_gk20a_mapping_batch *batch, | 740 | struct vm_gk20a_mapping_batch *batch, |
516 | enum nvgpu_aperture aperture) | 741 | enum nvgpu_aperture aperture) |
517 | { | 742 | { |
743 | struct gk20a *g = gk20a_from_vm(vm); | ||
518 | int err = 0; | 744 | int err = 0; |
519 | bool allocated = false; | 745 | bool allocated = false; |
520 | struct gk20a *g = gk20a_from_vm(vm); | ||
521 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 746 | int ctag_granularity = g->ops.fb.compression_page_size(g); |
522 | u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); | 747 | struct nvgpu_gmmu_attrs attrs = { |
523 | 748 | .pgsz = pgsz_idx, | |
524 | /* Allocate (or validate when map_offset != 0) the virtual address. */ | 749 | .kind_v = kind_v, |
525 | if (!map_offset) { | 750 | .ctag = (u64)ctag_offset * (u64)ctag_granularity, |
526 | map_offset = __nvgpu_vm_alloc_va(vm, size, | 751 | .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
527 | pgsz_idx); | 752 | .rw_flag = rw_flag, |
528 | if (!map_offset) { | 753 | .sparse = sparse, |
754 | .priv = priv, | ||
755 | .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE), | ||
756 | .aperture = aperture | ||
757 | }; | ||
758 | |||
759 | /* | ||
760 | * Only allocate a new GPU VA range if we haven't already been passed a | ||
761 | * GPU VA range. This facilitates fixed mappings. | ||
762 | */ | ||
763 | if (!vaddr) { | ||
764 | vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx); | ||
765 | if (!vaddr) { | ||
529 | nvgpu_err(g, "failed to allocate va space"); | 766 | nvgpu_err(g, "failed to allocate va space"); |
530 | err = -ENOMEM; | 767 | err = -ENOMEM; |
531 | goto fail_alloc; | 768 | goto fail_alloc; |
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
533 | allocated = true; | 770 | allocated = true; |
534 | } | 771 | } |
535 | 772 | ||
536 | gmmu_dbg(g, | 773 | err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, |
537 | "gv: 0x%04x_%08x + 0x%-7llx " | 774 | vaddr, size, &attrs); |
538 | "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " | ||
539 | "pgsz=%-3dKb as=%-2d ctags=%d start=%d " | ||
540 | "kind=0x%x flags=0x%x apt=%s", | ||
541 | u64_hi32(map_offset), u64_lo32(map_offset), size, | ||
542 | sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
543 | sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
544 | sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, | ||
545 | sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0, | ||
546 | vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), | ||
547 | ctag_lines, ctag_offset, | ||
548 | kind_v, flags, nvgpu_aperture_str(aperture)); | ||
549 | |||
550 | err = update_gmmu_ptes_locked(vm, pgsz_idx, | ||
551 | sgt, | ||
552 | buffer_offset, | ||
553 | map_offset, map_offset + size, | ||
554 | kind_v, | ||
555 | ctag_offset, | ||
556 | flags & | ||
557 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
558 | flags & | ||
559 | NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, | ||
560 | rw_flag, | ||
561 | sparse, | ||
562 | priv, | ||
563 | aperture); | ||
564 | if (err) { | 775 | if (err) { |
565 | nvgpu_err(g, "failed to update ptes on map"); | 776 | nvgpu_err(g, "failed to update ptes on map"); |
566 | goto fail_validate; | 777 | goto fail_validate; |
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
571 | else | 782 | else |
572 | batch->need_tlb_invalidate = true; | 783 | batch->need_tlb_invalidate = true; |
573 | 784 | ||
574 | return map_offset; | 785 | return vaddr; |
575 | fail_validate: | 786 | fail_validate: |
576 | if (allocated) | 787 | if (allocated) |
577 | __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); | 788 | __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
578 | fail_alloc: | 789 | fail_alloc: |
579 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); | 790 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); |
580 | return 0; | 791 | return 0; |
581 | } | 792 | } |
582 | 793 | ||
583 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | 794 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, |
584 | u64 vaddr, | 795 | u64 vaddr, |
585 | u64 size, | 796 | u64 size, |
586 | int pgsz_idx, | 797 | int pgsz_idx, |
587 | bool va_allocated, | 798 | bool va_allocated, |
588 | int rw_flag, | 799 | int rw_flag, |
589 | bool sparse, | 800 | bool sparse, |
590 | struct vm_gk20a_mapping_batch *batch) | 801 | struct vm_gk20a_mapping_batch *batch) |
591 | { | 802 | { |
592 | int err = 0; | 803 | int err = 0; |
593 | struct gk20a *g = gk20a_from_vm(vm); | 804 | struct gk20a *g = gk20a_from_vm(vm); |
805 | struct nvgpu_gmmu_attrs attrs = { | ||
806 | .pgsz = pgsz_idx, | ||
807 | .kind_v = 0, | ||
808 | .ctag = 0, | ||
809 | .cacheable = 0, | ||
810 | .rw_flag = rw_flag, | ||
811 | .sparse = sparse, | ||
812 | .priv = 0, | ||
813 | .valid = 0, | ||
814 | .aperture = APERTURE_INVALID, | ||
815 | }; | ||
594 | 816 | ||
595 | if (va_allocated) { | 817 | if (va_allocated) { |
596 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); | 818 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | |||
601 | } | 823 | } |
602 | 824 | ||
603 | /* unmap here needs to know the page size we assigned at mapping */ | 825 | /* unmap here needs to know the page size we assigned at mapping */ |
604 | err = update_gmmu_ptes_locked(vm, | 826 | err = __nvgpu_gmmu_update_page_table(vm, NULL, 0, |
605 | pgsz_idx, | 827 | vaddr, size, &attrs); |
606 | NULL, /* n/a for unmap */ | ||
607 | 0, | ||
608 | vaddr, | ||
609 | vaddr + size, | ||
610 | 0, 0, false /* n/a for unmap */, | ||
611 | false, rw_flag, | ||
612 | sparse, 0, | ||
613 | APERTURE_INVALID); /* don't care for unmap */ | ||
614 | if (err) | 828 | if (err) |
615 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); | 829 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); |
616 | 830 | ||
617 | /* flush l2 so any dirty lines are written out *now*. | ||
618 | * also as we could potentially be switching this buffer | ||
619 | * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at | ||
620 | * some point in the future we need to invalidate l2. e.g. switching | ||
621 | * from a render buffer unmap (here) to later using the same memory | ||
622 | * for gmmu ptes. note the positioning of this relative to any smmu | ||
623 | * unmapping (below). */ | ||
624 | |||
625 | if (!batch) { | 831 | if (!batch) { |
626 | gk20a_mm_l2_flush(g, true); | 832 | gk20a_mm_l2_flush(g, true); |
627 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 833 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); |
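
The __set_pd_level() recursion in gmmu.c above can be pictured as a walk that, at every level, carves the virtual range into pieces no larger than that level's PDE coverage and recurses until the PTE level is reached. The toy below sketches just that chunking; the three coverage sizes are made up and do not reflect any real GMMU layout, and an unmap is modelled by keeping phys at zero.

#include <stdio.h>
#include <stdint.h>

static const uint64_t pde_range[] = {
	1ULL << 30,	/* level 0: hypothetical 1 GB per PDE */
	1ULL << 21,	/* level 1: hypothetical 2 MB per PDE */
	1ULL << 12,	/* level 2: PTEs, hypothetical 4 KB pages */
};

static void set_level(int lvl, uint64_t phys, uint64_t virt, uint64_t length)
{
	while (length) {
		/* Truncate to the next PDE boundary at this level. */
		uint64_t chunk = pde_range[lvl] -
				 (virt & (pde_range[lvl] - 1));

		if (chunk > length)
			chunk = length;

		printf("L%d virt 0x%llx +0x%llx\n", lvl,
		       (unsigned long long)virt, (unsigned long long)chunk);

		if (lvl < 2)	/* more PDE levels below this one */
			set_level(lvl + 1, phys, virt, chunk);

		virt += chunk;
		if (phys)	/* phys == 0 means "unmap": keep it zero */
			phys += chunk;
		length -= chunk;
	}
}

int main(void)
{
	set_level(0, 0x80000000ULL, 0x40001000ULL, 0x300000ULL);
	return 0;
}
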
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 88622eca..3aeba500 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm) | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | 38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, |
39 | struct gk20a_mm_entry *parent, | 39 | struct nvgpu_gmmu_pd *parent, |
40 | int level) | 40 | int level) |
41 | { | 41 | { |
42 | int i; | 42 | int i; |
@@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
75 | 75 | ||
76 | /* Be certain we round up to page_size if needed */ | 76 | /* Be certain we round up to page_size if needed */ |
77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); | 77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); |
78 | nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size, | ||
79 | vm->gmmu_page_sizes[pgsz_idx] >> 10); | ||
80 | 78 | ||
81 | addr = nvgpu_alloc(vma, size); | 79 | addr = nvgpu_alloc(vma, size); |
82 | if (!addr) { | 80 | if (!addr) { |
@@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
84 | return 0; | 82 | return 0; |
85 | } | 83 | } |
86 | 84 | ||
87 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
88 | return addr; | 85 | return addr; |
89 | } | 86 | } |
90 | 87 | ||
91 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, | 88 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, |
92 | enum gmmu_pgsz_gk20a pgsz_idx) | 89 | enum gmmu_pgsz_gk20a pgsz_idx) |
93 | { | 90 | { |
94 | struct gk20a *g = vm->mm->g; | ||
95 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; | 91 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; |
96 | 92 | ||
97 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
98 | nvgpu_free(vma, addr); | 93 | nvgpu_free(vma, addr); |
99 | 94 | ||
100 | return 0; | 95 | return 0; |
@@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm, | |||
127 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 122 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
128 | } | 123 | } |
129 | 124 | ||
130 | static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm) | ||
131 | { | ||
132 | u32 pde_lo, pde_hi; | ||
133 | int err; | ||
134 | |||
135 | pde_range_from_vaddr_range(vm, | ||
136 | 0, vm->va_limit-1, | ||
137 | &pde_lo, &pde_hi); | ||
138 | vm->pdb.entries = nvgpu_vzalloc(vm->mm->g, | ||
139 | sizeof(struct gk20a_mm_entry) * | ||
140 | (pde_hi + 1)); | ||
141 | vm->pdb.num_entries = pde_hi + 1; | ||
142 | |||
143 | if (!vm->pdb.entries) | ||
144 | return -ENOMEM; | ||
145 | |||
146 | err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], | ||
147 | &vm->pdb, NULL); | ||
148 | if (err) { | ||
149 | nvgpu_vfree(vm->mm->g, vm->pdb.entries); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /* | 125 | /* |
157 | * Determine if the passed address space can support big pages or not. | 126 | * Determine if the passed address space can support big pages or not. |
158 | */ | 127 | */ |
@@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm, | |||
280 | #endif | 249 | #endif |
281 | 250 | ||
282 | /* Initialize the page table data structures. */ | 251 | /* Initialize the page table data structures. */ |
283 | err = nvgpu_vm_init_page_tables(vm); | 252 | strncpy(vm->name, name, min(strlen(name), sizeof(vm->name))); |
253 | err = nvgpu_gmmu_init_page_table(vm); | ||
284 | if (err) | 254 | if (err) |
285 | goto clean_up_vgpu_vm; | 255 | goto clean_up_vgpu_vm; |
286 | 256 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
index 3c76e817..c5f9c1fd 100644
--- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -67,7 +67,7 @@ void gk20a_fb_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb) | |||
67 | if (!g->power_on) | 67 | if (!g->power_on) |
68 | return; | 68 | return; |
69 | 69 | ||
70 | addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, pdb, 0) >> 12); | 70 | addr_lo = u64_lo32(nvgpu_mem_get_base_addr(g, pdb, 0) >> 12); |
71 | 71 | ||
72 | nvgpu_mutex_acquire(&g->mm.tlb_lock); | 72 | nvgpu_mutex_acquire(&g->mm.tlb_lock); |
73 | 73 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index b7b68575..558a1b06 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -777,31 +777,6 @@ int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm) | |||
777 | return vm->mmu_levels[0].lo_bit[0]; | 777 | return vm->mmu_levels[0].lo_bit[0]; |
778 | } | 778 | } |
779 | 779 | ||
780 | /* given address range (inclusive) determine the pdes crossed */ | ||
781 | void pde_range_from_vaddr_range(struct vm_gk20a *vm, | ||
782 | u64 addr_lo, u64 addr_hi, | ||
783 | u32 *pde_lo, u32 *pde_hi) | ||
784 | { | ||
785 | int pde_shift = gk20a_mm_pde_coverage_bit_count(vm); | ||
786 | |||
787 | *pde_lo = (u32)(addr_lo >> pde_shift); | ||
788 | *pde_hi = (u32)(addr_hi >> pde_shift); | ||
789 | gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d", | ||
790 | addr_lo, addr_hi, pde_shift); | ||
791 | gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d", | ||
792 | *pde_lo, *pde_hi); | ||
793 | } | ||
794 | |||
795 | static u32 pde_from_index(u32 i) | ||
796 | { | ||
797 | return i * gmmu_pde__size_v() / sizeof(u32); | ||
798 | } | ||
799 | |||
800 | static u32 pte_from_index(u32 i) | ||
801 | { | ||
802 | return i * gmmu_pte__size_v() / sizeof(u32); | ||
803 | } | ||
804 | |||
805 | int nvgpu_vm_get_buffers(struct vm_gk20a *vm, | 780 | int nvgpu_vm_get_buffers(struct vm_gk20a *vm, |
806 | struct nvgpu_mapped_buf ***mapped_buffers, | 781 | struct nvgpu_mapped_buf ***mapped_buffers, |
807 | int *num_buffers) | 782 | int *num_buffers) |
@@ -1478,7 +1453,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) | |||
1478 | * If mem is in VIDMEM, return base address in vidmem | 1453 | * If mem is in VIDMEM, return base address in vidmem |
1479 | * else return IOVA address for SYSMEM | 1454 | * else return IOVA address for SYSMEM |
1480 | */ | 1455 | */ |
1481 | u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, | 1456 | u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, |
1482 | u32 flags) | 1457 | u32 flags) |
1483 | { | 1458 | { |
1484 | struct nvgpu_page_alloc *alloc; | 1459 | struct nvgpu_page_alloc *alloc; |
@@ -1580,203 +1555,168 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | |||
1580 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); | 1555 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); |
1581 | } | 1556 | } |
1582 | 1557 | ||
1583 | void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, | ||
1584 | size_t w, size_t data) | ||
1585 | { | ||
1586 | nvgpu_mem_wr32(g, &entry->mem, entry->woffset + w, data); | ||
1587 | } | ||
1588 | |||
1589 | u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry) | ||
1590 | { | ||
1591 | u64 base; | ||
1592 | |||
1593 | if (g->mm.has_physical_mode) | ||
1594 | base = sg_phys(entry->mem.priv.sgt->sgl); | ||
1595 | else | ||
1596 | base = gk20a_mem_get_base_addr(g, &entry->mem, 0); | ||
1597 | |||
1598 | return base + entry->woffset * sizeof(u32); | ||
1599 | } | ||
1600 | |||
1601 | /* for gk20a the "video memory" apertures here are misnomers. */ | 1558 | /* for gk20a the "video memory" apertures here are misnomers. */ |
1602 | static inline u32 big_valid_pde0_bits(struct gk20a *g, | 1559 | static inline u32 big_valid_pde0_bits(struct gk20a *g, |
1603 | struct gk20a_mm_entry *entry) | 1560 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1604 | { | 1561 | { |
1605 | u64 pte_addr = gk20a_pde_addr(g, entry); | ||
1606 | u32 pde0_bits = | 1562 | u32 pde0_bits = |
1607 | nvgpu_aperture_mask(g, &entry->mem, | 1563 | nvgpu_aperture_mask(g, &pd->mem, |
1608 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), | 1564 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), |
1609 | gmmu_pde_aperture_big_video_memory_f()) | | 1565 | gmmu_pde_aperture_big_video_memory_f()) | |
1610 | gmmu_pde_address_big_sys_f( | 1566 | gmmu_pde_address_big_sys_f( |
1611 | (u32)(pte_addr >> gmmu_pde_address_shift_v())); | 1567 | (u32)(addr >> gmmu_pde_address_shift_v())); |
1612 | 1568 | ||
1613 | return pde0_bits; | 1569 | return pde0_bits; |
1614 | } | 1570 | } |
1615 | 1571 | ||
1616 | static inline u32 small_valid_pde1_bits(struct gk20a *g, | 1572 | static inline u32 small_valid_pde1_bits(struct gk20a *g, |
1617 | struct gk20a_mm_entry *entry) | 1573 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1618 | { | 1574 | { |
1619 | u64 pte_addr = gk20a_pde_addr(g, entry); | ||
1620 | u32 pde1_bits = | 1575 | u32 pde1_bits = |
1621 | nvgpu_aperture_mask(g, &entry->mem, | 1576 | nvgpu_aperture_mask(g, &pd->mem, |
1622 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), | 1577 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), |
1623 | gmmu_pde_aperture_small_video_memory_f()) | | 1578 | gmmu_pde_aperture_small_video_memory_f()) | |
1624 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ | 1579 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ |
1625 | gmmu_pde_address_small_sys_f( | 1580 | gmmu_pde_address_small_sys_f( |
1626 | (u32)(pte_addr >> gmmu_pde_address_shift_v())); | 1581 | (u32)(addr >> gmmu_pde_address_shift_v())); |
1627 | 1582 | ||
1628 | return pde1_bits; | 1583 | return pde1_bits; |
1629 | } | 1584 | } |
1630 | 1585 | ||
1631 | /* Given the current state of the ptes associated with a pde, | 1586 | static void update_gmmu_pde_locked(struct vm_gk20a *vm, |
1632 | determine value and write it out. There's no checking | 1587 | const struct gk20a_mmu_level *l, |
1633 | here to determine whether or not a change was actually | 1588 | struct nvgpu_gmmu_pd *pd, |
1634 | made. So, superfluous updates will cause unnecessary | 1589 | u32 pd_idx, |
1635 | pde invalidations. | 1590 | u64 virt_addr, |
1636 | */ | 1591 | u64 phys_addr, |
1637 | static int update_gmmu_pde_locked(struct vm_gk20a *vm, | 1592 | struct nvgpu_gmmu_attrs *attrs) |
1638 | struct gk20a_mm_entry *pte, | ||
1639 | u32 i, u32 gmmu_pgsz_idx, | ||
1640 | struct scatterlist **sgl, | ||
1641 | u64 *offset, | ||
1642 | u64 *iova, | ||
1643 | u32 kind_v, u64 *ctag, | ||
1644 | bool cacheable, bool unammped_pte, | ||
1645 | int rw_flag, bool sparse, bool priv, | ||
1646 | enum nvgpu_aperture aperture) | ||
1647 | { | 1593 | { |
1648 | struct gk20a *g = gk20a_from_vm(vm); | 1594 | struct gk20a *g = gk20a_from_vm(vm); |
1649 | bool small_valid, big_valid; | 1595 | bool small_valid, big_valid; |
1650 | struct gk20a_mm_entry *entry = vm->pdb.entries + i; | 1596 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
1651 | u32 pde_v[2] = {0, 0}; | 1597 | u32 pde_v[2] = {0, 0}; |
1652 | u32 pde; | ||
1653 | 1598 | ||
1654 | gk20a_dbg_fn(""); | 1599 | small_valid = attrs->pgsz == gmmu_page_size_small; |
1655 | 1600 | big_valid = attrs->pgsz == gmmu_page_size_big; | |
1656 | small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; | ||
1657 | big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; | ||
1658 | 1601 | ||
1659 | pde_v[0] = gmmu_pde_size_full_f(); | 1602 | pde_v[0] = gmmu_pde_size_full_f(); |
1660 | pde_v[0] |= big_valid ? | 1603 | pde_v[0] |= big_valid ? |
1661 | big_valid_pde0_bits(g, entry) : | 1604 | big_valid_pde0_bits(g, pd, phys_addr) : |
1662 | gmmu_pde_aperture_big_invalid_f(); | 1605 | gmmu_pde_aperture_big_invalid_f(); |
1663 | 1606 | ||
1664 | pde_v[1] |= (small_valid ? | 1607 | pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) : |
1665 | small_valid_pde1_bits(g, entry) : | ||
1666 | (gmmu_pde_aperture_small_invalid_f() | | 1608 | (gmmu_pde_aperture_small_invalid_f() | |
1667 | gmmu_pde_vol_small_false_f())) | 1609 | gmmu_pde_vol_small_false_f())) |
1668 | | | 1610 | | |
1669 | (big_valid ? (gmmu_pde_vol_big_true_f()) : | 1611 | (big_valid ? (gmmu_pde_vol_big_true_f()) : |
1670 | gmmu_pde_vol_big_false_f()); | 1612 | gmmu_pde_vol_big_false_f()); |
1671 | 1613 | ||
1672 | pde = pde_from_index(i); | 1614 | pte_dbg(g, attrs, |
1615 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " | ||
1616 | "GPU %#-12llx phys %#-12llx " | ||
1617 | "[0x%08x, 0x%08x]", | ||
1618 | pd_idx, l->entry_size, pd_offset, | ||
1619 | small_valid ? 'S' : '-', | ||
1620 | big_valid ? 'B' : '-', | ||
1621 | virt_addr, phys_addr, | ||
1622 | pde_v[1], pde_v[0]); | ||
1673 | 1623 | ||
1674 | gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]); | 1624 | pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]); |
1675 | gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]); | 1625 | pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]); |
1626 | } | ||
1676 | 1627 | ||
1677 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", | 1628 | static void __update_pte_sparse(u32 *pte_w) |
1678 | i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); | 1629 | { |
1679 | return 0; | 1630 | pte_w[0] = gmmu_pte_valid_false_f(); |
1631 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1680 | } | 1632 | } |
1681 | 1633 | ||
1682 | static int update_gmmu_pte_locked(struct vm_gk20a *vm, | 1634 | static void __update_pte(struct vm_gk20a *vm, |
1683 | struct gk20a_mm_entry *pte, | 1635 | u32 *pte_w, |
1684 | u32 i, u32 gmmu_pgsz_idx, | 1636 | u64 phys_addr, |
1685 | struct scatterlist **sgl, | 1637 | struct nvgpu_gmmu_attrs *attrs) |
1686 | u64 *offset, | ||
1687 | u64 *iova, | ||
1688 | u32 kind_v, u64 *ctag, | ||
1689 | bool cacheable, bool unmapped_pte, | ||
1690 | int rw_flag, bool sparse, bool priv, | ||
1691 | enum nvgpu_aperture aperture) | ||
1692 | { | 1638 | { |
1693 | struct gk20a *g = gk20a_from_vm(vm); | 1639 | struct gk20a *g = gk20a_from_vm(vm); |
1640 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
1641 | u32 pte_valid = attrs->valid ? | ||
1642 | gmmu_pte_valid_true_f() : | ||
1643 | gmmu_pte_valid_false_f(); | ||
1644 | u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v(); | ||
1645 | u32 addr = attrs->aperture == APERTURE_SYSMEM ? | ||
1646 | gmmu_pte_address_sys_f(phys_shifted) : | ||
1647 | gmmu_pte_address_vid_f(phys_shifted); | ||
1694 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); | 1648 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); |
1695 | u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; | ||
1696 | u32 pte_w[2] = {0, 0}; /* invalid pte */ | ||
1697 | |||
1698 | if (*iova) { | ||
1699 | u32 pte_valid = unmapped_pte ? | ||
1700 | gmmu_pte_valid_false_f() : | ||
1701 | gmmu_pte_valid_true_f(); | ||
1702 | u32 iova_v = *iova >> gmmu_pte_address_shift_v(); | ||
1703 | u32 pte_addr = aperture == APERTURE_SYSMEM ? | ||
1704 | gmmu_pte_address_sys_f(iova_v) : | ||
1705 | gmmu_pte_address_vid_f(iova_v); | ||
1706 | |||
1707 | pte_w[0] = pte_valid | pte_addr; | ||
1708 | |||
1709 | if (priv) | ||
1710 | pte_w[0] |= gmmu_pte_privilege_true_f(); | ||
1711 | |||
1712 | pte_w[1] = __nvgpu_aperture_mask(g, aperture, | ||
1713 | gmmu_pte_aperture_sys_mem_ncoh_f(), | ||
1714 | gmmu_pte_aperture_video_memory_f()) | | ||
1715 | gmmu_pte_kind_f(kind_v) | | ||
1716 | gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift)); | ||
1717 | |||
1718 | if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000) | ||
1719 | pte_w[1] |= gmmu_pte_comptagline_f( | ||
1720 | 1 << (gmmu_pte_comptagline_s() - 1)); | ||
1721 | |||
1722 | if (rw_flag == gk20a_mem_flag_read_only) { | ||
1723 | pte_w[0] |= gmmu_pte_read_only_true_f(); | ||
1724 | pte_w[1] |= | ||
1725 | gmmu_pte_write_disable_true_f(); | ||
1726 | } else if (rw_flag == | ||
1727 | gk20a_mem_flag_write_only) { | ||
1728 | pte_w[1] |= | ||
1729 | gmmu_pte_read_disable_true_f(); | ||
1730 | } | ||
1731 | if (!unmapped_pte) { | ||
1732 | if (!cacheable) | ||
1733 | pte_w[1] |= | ||
1734 | gmmu_pte_vol_true_f(); | ||
1735 | } else { | ||
1736 | /* Store cacheable value behind | ||
1737 | * gmmu_pte_write_disable_true_f */ | ||
1738 | if (!cacheable) | ||
1739 | pte_w[1] |= | ||
1740 | gmmu_pte_write_disable_true_f(); | ||
1741 | } | ||
1742 | 1649 | ||
1743 | gk20a_dbg(gpu_dbg_pte, | 1650 | pte_w[0] = pte_valid | addr; |
1744 | "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]", | ||
1745 | i, *iova, | ||
1746 | kind_v, (u32)(*ctag >> ctag_shift), !cacheable, | ||
1747 | pte_w[1], pte_w[0]); | ||
1748 | 1651 | ||
1749 | if (*ctag) | 1652 | if (attrs->priv) |
1750 | *ctag += page_size; | 1653 | pte_w[0] |= gmmu_pte_privilege_true_f(); |
1751 | } else if (sparse) { | ||
1752 | pte_w[0] = gmmu_pte_valid_false_f(); | ||
1753 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1754 | } else { | ||
1755 | gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); | ||
1756 | } | ||
1757 | 1654 | ||
1758 | gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]); | 1655 | pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture, |
1759 | gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]); | 1656 | gmmu_pte_aperture_sys_mem_ncoh_f(), |
1760 | 1657 | gmmu_pte_aperture_video_memory_f()) | | |
1761 | if (*iova) { | 1658 | gmmu_pte_kind_f(attrs->kind_v) | |
1762 | *iova += page_size; | 1659 | gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift)); |
1763 | *offset += page_size; | 1660 | |
1764 | if (*sgl && *offset + page_size > (*sgl)->length) { | 1661 | if (attrs->ctag && vm->mm->use_full_comp_tag_line && |
1765 | u64 new_iova; | 1662 | phys_addr & 0x10000) |
1766 | *sgl = sg_next(*sgl); | 1663 | pte_w[1] |= gmmu_pte_comptagline_f( |
1767 | if (*sgl) { | 1664 | 1 << (gmmu_pte_comptagline_s() - 1)); |
1768 | new_iova = sg_phys(*sgl); | 1665 | |
1769 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | 1666 | if (attrs->rw_flag == gk20a_mem_flag_read_only) { |
1770 | new_iova, (*sgl)->length); | 1667 | pte_w[0] |= gmmu_pte_read_only_true_f(); |
1771 | if (new_iova) { | 1668 | pte_w[1] |= gmmu_pte_write_disable_true_f(); |
1772 | *offset = 0; | 1669 | } else if (attrs->rw_flag == gk20a_mem_flag_write_only) { |
1773 | *iova = new_iova; | 1670 | pte_w[1] |= gmmu_pte_read_disable_true_f(); |
1774 | } | ||
1775 | } | ||
1776 | } | ||
1777 | } | 1671 | } |
1778 | 1672 | ||
1779 | return 0; | 1673 | if (!attrs->cacheable) |
1674 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1675 | |||
1676 | if (attrs->ctag) | ||
1677 | attrs->ctag += page_size; | ||
1678 | } | ||
1679 | |||
1680 | static void update_gmmu_pte_locked(struct vm_gk20a *vm, | ||
1681 | const struct gk20a_mmu_level *l, | ||
1682 | struct nvgpu_gmmu_pd *pd, | ||
1683 | u32 pd_idx, | ||
1684 | u64 virt_addr, | ||
1685 | u64 phys_addr, | ||
1686 | struct nvgpu_gmmu_attrs *attrs) | ||
1687 | { | ||
1688 | struct gk20a *g = gk20a_from_vm(vm); | ||
1689 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
1690 | u32 pd_offset = pd_offset_from_index(l, pd_idx); | ||
1691 | u32 pte_w[2] = {0, 0}; | ||
1692 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); | ||
1693 | |||
1694 | if (phys_addr) | ||
1695 | __update_pte(vm, pte_w, phys_addr, attrs); | ||
1696 | else if (attrs->sparse) | ||
1697 | __update_pte_sparse(pte_w); | ||
1698 | |||
1699 | pte_dbg(g, attrs, | ||
1700 | "PTE: i=%-4u size=%-2u offs=%-4u | " | ||
1701 | "GPU %#-12llx phys %#-12llx " | ||
1702 | "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " | ||
1703 | "ctag=0x%08x " | ||
1704 | "[0x%08x, 0x%08x]", | ||
1705 | pd_idx, l->entry_size, pd_offset, | ||
1706 | virt_addr, phys_addr, | ||
1707 | page_size >> 10, | ||
1708 | nvgpu_gmmu_perm_str(attrs->rw_flag), | ||
1709 | attrs->kind_v, | ||
1710 | nvgpu_aperture_str(attrs->aperture), | ||
1711 | attrs->valid ? 'V' : '-', | ||
1712 | attrs->cacheable ? 'C' : '-', | ||
1713 | attrs->sparse ? 'S' : '-', | ||
1714 | attrs->priv ? 'P' : '-', | ||
1715 | (u32)attrs->ctag >> ctag_shift, | ||
1716 | pte_w[1], pte_w[0]); | ||
1717 | |||
1718 | pd_write(g, pd, pd_offset + 0, pte_w[0]); | ||
1719 | pd_write(g, pd, pd_offset + 1, pte_w[1]); | ||
1780 | } | 1720 | } |
1781 | 1721 | ||
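A note on the comptag arithmetic used by __update_pte() above: attrs->ctag is a byte offset that advances by the mapping page size after every PTE, and the comptag line written into the PTE is that offset divided by the compression page size. A minimal sketch, assuming an example 128kB compression page size (the real value comes from g->ops.fb.compression_page_size(g)):

	u64 comp_page_size = 128 << 10;            /* assumed example value */
	int ctag_shift = ilog2(comp_page_size);    /* 17 for this example */
	u32 comptagline = (u32)(attrs->ctag >> ctag_shift);
	/* after the PTE words are written, attrs->ctag += page_size, so all
	 * small pages within one compression page share a comptag line */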
1782 | /* NOTE! mapped_buffers lock must be held */ | 1722 | /* NOTE! mapped_buffers lock must be held */ |
@@ -1809,13 +1749,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer, | |||
1809 | mapped_buffer->vm_area->sparse : false, | 1749 | mapped_buffer->vm_area->sparse : false, |
1810 | batch); | 1750 | batch); |
1811 | 1751 | ||
1812 | gk20a_dbg(gpu_dbg_map, | ||
1813 | "gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d", | ||
1814 | u64_hi32(mapped_buffer->addr), u64_lo32(mapped_buffer->addr), | ||
1815 | vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10, | ||
1816 | vm_aspace_id(vm), | ||
1817 | mapped_buffer->own_mem_ref); | ||
1818 | |||
1819 | gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, | 1752 | gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, |
1820 | mapped_buffer->sgt); | 1753 | mapped_buffer->sgt); |
1821 | 1754 | ||
@@ -1942,6 +1875,9 @@ int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch) | |||
1942 | if (err) | 1875 | if (err) |
1943 | ch->vm = NULL; | 1876 | ch->vm = NULL; |
1944 | 1877 | ||
1878 | nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s", | ||
1879 | ch->chid, vm->name); | ||
1880 | |||
1945 | return err; | 1881 | return err; |
1946 | } | 1882 | } |
1947 | 1883 | ||
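The new bind-time log depends on the vm->name field added to struct vm_gk20a in the vm.h hunk further down. How the name gets populated is outside this excerpt; a hedged sketch of one plausible scheme, reusing the vm_aspace_id() helper visible elsewhere in this file:

	/* hypothetical: tag the VM with its address space id at init time */
	snprintf(vm->name, sizeof(vm->name), "as-%d", vm_aspace_id(vm));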
@@ -2114,7 +2050,7 @@ u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block) | |||
2114 | if (g->mm.has_physical_mode) | 2050 | if (g->mm.has_physical_mode) |
2115 | addr = gk20a_mem_phys(inst_block); | 2051 | addr = gk20a_mem_phys(inst_block); |
2116 | else | 2052 | else |
2117 | addr = gk20a_mem_get_base_addr(g, inst_block, 0); | 2053 | addr = nvgpu_mem_get_base_addr(g, inst_block, 0); |
2118 | 2054 | ||
2119 | return addr; | 2055 | return addr; |
2120 | } | 2056 | } |
@@ -2237,7 +2173,7 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) | |||
2237 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 2173 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
2238 | struct vm_gk20a *vm) | 2174 | struct vm_gk20a *vm) |
2239 | { | 2175 | { |
2240 | u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); | 2176 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); |
2241 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 2177 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
2242 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 2178 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
2243 | 2179 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index cf37640d..a245d0e0 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -42,12 +42,6 @@ | |||
42 | outer_flush_range(pa, pa + (size_t)(size)); \ | 42 | outer_flush_range(pa, pa + (size_t)(size)); \ |
43 | } while (0) | 43 | } while (0) |
44 | 44 | ||
45 | enum gk20a_mem_rw_flag { | ||
46 | gk20a_mem_flag_none = 0, | ||
47 | gk20a_mem_flag_read_only = 1, | ||
48 | gk20a_mem_flag_write_only = 2, | ||
49 | }; | ||
50 | |||
51 | struct gpfifo_desc { | 45 | struct gpfifo_desc { |
52 | struct nvgpu_mem mem; | 46 | struct nvgpu_mem mem; |
53 | u32 entry_num; | 47 | u32 entry_num; |
@@ -347,7 +341,7 @@ int gk20a_mm_suspend(struct gk20a *g); | |||
347 | u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | 341 | u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, |
348 | u32 flags); | 342 | u32 flags); |
349 | u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); | 343 | u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); |
350 | u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, | 344 | u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, |
351 | u32 flags); | 345 | u32 flags); |
352 | 346 | ||
353 | void gk20a_mm_ltc_isr(struct gk20a *g); | 347 | void gk20a_mm_ltc_isr(struct gk20a *g); |
@@ -371,10 +365,6 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem) | |||
371 | return 0; | 365 | return 0; |
372 | } | 366 | } |
373 | 367 | ||
374 | void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, | ||
375 | size_t w, size_t data); | ||
376 | u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry); | ||
377 | |||
378 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | 368 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, |
379 | u64 map_offset, | 369 | u64 map_offset, |
380 | struct sg_table *sgt, | 370 | struct sg_table *sgt, |
@@ -451,8 +441,4 @@ int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd, | |||
451 | u64 *buffer_id, u64 *buffer_len); | 441 | u64 *buffer_id, u64 *buffer_len); |
452 | void gk20a_vm_unmap_locked_kref(struct kref *ref); | 442 | void gk20a_vm_unmap_locked_kref(struct kref *ref); |
453 | 443 | ||
454 | void gk20a_vm_free_entries(struct vm_gk20a *vm, | ||
455 | struct gk20a_mm_entry *parent, | ||
456 | int level); | ||
457 | |||
458 | #endif /* MM_GK20A_H */ | 444 | #endif /* MM_GK20A_H */ |
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index d7391c6d..c3867e9d 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <nvgpu/dma.h> | 16 | #include <nvgpu/dma.h> |
17 | #include <nvgpu/gmmu.h> | ||
17 | 18 | ||
18 | #include "gk20a/gk20a.h" | 19 | #include "gk20a/gk20a.h" |
19 | #include "gk20a/platform_gk20a.h" | 20 | #include "gk20a/platform_gk20a.h" |
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | |||
149 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); | 150 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); |
150 | } | 151 | } |
151 | 152 | ||
152 | static u32 pde3_from_index(u32 i) | 153 | static void update_gmmu_pde3_locked(struct vm_gk20a *vm, |
153 | { | 154 | const struct gk20a_mmu_level *l, |
154 | return i * gmmu_new_pde__size_v() / sizeof(u32); | 155 | struct nvgpu_gmmu_pd *pd, |
155 | } | 156 | u32 pd_idx, |
156 | 157 | u64 virt_addr, | |
157 | static u32 pte3_from_index(u32 i) | 158 | u64 phys_addr, |
158 | { | 159 | struct nvgpu_gmmu_attrs *attrs) |
159 | return i * gmmu_new_pte__size_v() / sizeof(u32); | ||
160 | } | ||
161 | |||
162 | static int update_gmmu_pde3_locked(struct vm_gk20a *vm, | ||
163 | struct gk20a_mm_entry *parent, | ||
164 | u32 i, u32 gmmu_pgsz_idx, | ||
165 | struct scatterlist **sgl, | ||
166 | u64 *offset, | ||
167 | u64 *iova, | ||
168 | u32 kind_v, u64 *ctag, | ||
169 | bool cacheable, bool unmapped_pte, | ||
170 | int rw_flag, bool sparse, bool priv, | ||
171 | enum nvgpu_aperture aperture) | ||
172 | { | 160 | { |
173 | struct gk20a *g = gk20a_from_vm(vm); | 161 | struct gk20a *g = gk20a_from_vm(vm); |
174 | u64 pte_addr = 0; | 162 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
175 | struct gk20a_mm_entry *pte = parent->entries + i; | ||
176 | u32 pde_v[2] = {0, 0}; | 163 | u32 pde_v[2] = {0, 0}; |
177 | u32 pde; | ||
178 | |||
179 | gk20a_dbg_fn(""); | ||
180 | 164 | ||
181 | pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v(); | 165 | phys_addr >>= gmmu_new_pde_address_shift_v(); |
182 | 166 | ||
183 | pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem, | 167 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, |
184 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), | 168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), |
185 | gmmu_new_pde_aperture_video_memory_f()); | 169 | gmmu_new_pde_aperture_video_memory_f()); |
186 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr)); | 170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); |
187 | pde_v[0] |= gmmu_new_pde_vol_true_f(); | 171 | pde_v[0] |= gmmu_new_pde_vol_true_f(); |
188 | pde_v[1] |= pte_addr >> 24; | 172 | pde_v[1] |= phys_addr >> 24; |
189 | pde = pde3_from_index(i); | 173 | |
190 | 174 | pd_write(g, pd, pd_offset + 0, pde_v[0]); | |
191 | gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]); | 175 | pd_write(g, pd, pd_offset + 1, pde_v[1]); |
192 | gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]); | 176 | |
193 | 177 | pte_dbg(g, attrs, | |
194 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", | 178 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | " |
195 | i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); | 179 | "GPU %#-12llx phys %#-12llx " |
196 | gk20a_dbg_fn("done"); | 180 | "[0x%08x, 0x%08x]", |
197 | return 0; | 181 | pd_idx, l->entry_size, pd_offset, |
182 | virt_addr, phys_addr, | ||
183 | pde_v[1], pde_v[0]); | ||
198 | } | 184 | } |
199 | 185 | ||
200 | static u32 pde0_from_index(u32 i) | 186 | static void update_gmmu_pde0_locked(struct vm_gk20a *vm, |
201 | { | 187 | const struct gk20a_mmu_level *l, |
202 | return i * gmmu_new_dual_pde__size_v() / sizeof(u32); | 188 | struct nvgpu_gmmu_pd *pd, |
203 | } | 189 | u32 pd_idx, |
204 | 190 | u64 virt_addr, | |
205 | static int update_gmmu_pde0_locked(struct vm_gk20a *vm, | 191 | u64 phys_addr, |
206 | struct gk20a_mm_entry *pte, | 192 | struct nvgpu_gmmu_attrs *attrs) |
207 | u32 i, u32 gmmu_pgsz_idx, | ||
208 | struct scatterlist **sgl, | ||
209 | u64 *offset, | ||
210 | u64 *iova, | ||
211 | u32 kind_v, u64 *ctag, | ||
212 | bool cacheable, bool unmapped_pte, | ||
213 | int rw_flag, bool sparse, bool priv, | ||
214 | enum nvgpu_aperture aperture) | ||
215 | { | 193 | { |
216 | struct gk20a *g = gk20a_from_vm(vm); | 194 | struct gk20a *g = gk20a_from_vm(vm); |
217 | bool small_valid, big_valid; | 195 | bool small_valid, big_valid; |
218 | u32 pte_addr_small = 0, pte_addr_big = 0; | 196 | u32 small_addr = 0, big_addr = 0; |
219 | struct gk20a_mm_entry *entry = pte->entries + i; | 197 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
220 | u32 pde_v[4] = {0, 0, 0, 0}; | 198 | u32 pde_v[4] = {0, 0, 0, 0}; |
221 | u32 pde; | ||
222 | |||
223 | gk20a_dbg_fn(""); | ||
224 | 199 | ||
225 | small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; | 200 | small_valid = attrs->pgsz == gmmu_page_size_small; |
226 | big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; | 201 | big_valid = attrs->pgsz == gmmu_page_size_big; |
227 | 202 | ||
228 | if (small_valid) { | 203 | if (small_valid) |
229 | pte_addr_small = gk20a_pde_addr(g, entry) | 204 | small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v(); |
230 | >> gmmu_new_dual_pde_address_shift_v(); | ||
231 | } | ||
232 | 205 | ||
233 | if (big_valid) | 206 | if (big_valid) |
234 | pte_addr_big = gk20a_pde_addr(g, entry) | 207 | big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v(); |
235 | >> gmmu_new_dual_pde_address_big_shift_v(); | ||
236 | 208 | ||
237 | if (small_valid) { | 209 | if (small_valid) { |
238 | pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small); | 210 | pde_v[2] |= |
239 | pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem, | 211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); |
212 | pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, | ||
240 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), | 213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), |
241 | gmmu_new_dual_pde_aperture_small_video_memory_f()); | 214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); |
242 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); | 215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); |
243 | pde_v[3] |= pte_addr_small >> 24; | 216 | pde_v[3] |= small_addr >> 24; |
244 | } | 217 | } |
245 | 218 | ||
246 | if (big_valid) { | 219 | if (big_valid) { |
247 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big); | 220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); |
248 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); | 221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); |
249 | pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem, | 222 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, |
250 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), | 223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), |
251 | gmmu_new_dual_pde_aperture_big_video_memory_f()); | 224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); |
252 | pde_v[1] |= pte_addr_big >> 28; | 225 | pde_v[1] |= big_addr >> 28; |
253 | } | 226 | } |
254 | 227 | ||
255 | pde = pde0_from_index(i); | 228 | pd_write(g, pd, pd_offset + 0, pde_v[0]); |
256 | 229 | pd_write(g, pd, pd_offset + 1, pde_v[1]); | |
257 | gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]); | 230 | pd_write(g, pd, pd_offset + 2, pde_v[2]); |
258 | gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]); | 231 | pd_write(g, pd, pd_offset + 3, pde_v[3]); |
259 | gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]); | 232 | |
260 | gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]); | 233 | pte_dbg(g, attrs, |
261 | 234 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " | |
262 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]", | 235 | "GPU %#-12llx phys %#-12llx " |
263 | i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]); | 236 | "[0x%08x, 0x%08x, 0x%08x, 0x%08x]", |
264 | gk20a_dbg_fn("done"); | 237 | pd_idx, l->entry_size, pd_offset, |
265 | return 0; | 238 | small_valid ? 'S' : '-', |
239 | big_valid ? 'B' : '-', | ||
240 | virt_addr, phys_addr, | ||
241 | pde_v[3], pde_v[2], pde_v[1], pde_v[0]); | ||
266 | } | 242 | } |
267 | 243 | ||
268 | static int update_gmmu_pte_locked(struct vm_gk20a *vm, | 244 | static void __update_pte(struct vm_gk20a *vm, |
269 | struct gk20a_mm_entry *pte, | 245 | u32 *pte_w, |
270 | u32 i, u32 gmmu_pgsz_idx, | 246 | u64 phys_addr, |
271 | struct scatterlist **sgl, | 247 | struct nvgpu_gmmu_attrs *attrs) |
272 | u64 *offset, | ||
273 | u64 *iova, | ||
274 | u32 kind_v, u64 *ctag, | ||
275 | bool cacheable, bool unmapped_pte, | ||
276 | int rw_flag, bool sparse, bool priv, | ||
277 | enum nvgpu_aperture aperture) | ||
278 | { | 248 | { |
279 | struct gk20a *g = vm->mm->g; | 249 | struct gk20a *g = gk20a_from_vm(vm); |
280 | u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; | ||
281 | u64 ctag_granularity = g->ops.fb.compression_page_size(g); | 250 | u64 ctag_granularity = g->ops.fb.compression_page_size(g); |
282 | u32 pte_w[2] = {0, 0}; /* invalid pte */ | 251 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; |
283 | u32 pte_i; | 252 | u32 pte_valid = attrs->valid ? |
284 | 253 | gmmu_new_pte_valid_true_f() : | |
285 | if (*iova) { | 254 | gmmu_new_pte_valid_false_f(); |
286 | u32 pte_valid = unmapped_pte ? | 255 | u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v(); |
287 | gmmu_new_pte_valid_false_f() : | 256 | u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ? |
288 | gmmu_new_pte_valid_true_f(); | 257 | gmmu_new_pte_address_sys_f(phys_shifted) : |
289 | u32 iova_v = *iova >> gmmu_new_pte_address_shift_v(); | 258 | gmmu_new_pte_address_vid_f(phys_shifted); |
290 | u32 pte_addr = aperture == APERTURE_SYSMEM ? | 259 | u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture, |
291 | gmmu_new_pte_address_sys_f(iova_v) : | 260 | gmmu_new_pte_aperture_sys_mem_ncoh_f(), |
292 | gmmu_new_pte_address_vid_f(iova_v); | 261 | gmmu_new_pte_aperture_video_memory_f()); |
293 | u32 pte_tgt = __nvgpu_aperture_mask(g, aperture, | 262 | |
294 | gmmu_new_pte_aperture_sys_mem_ncoh_f(), | 263 | pte_w[0] = pte_valid | pte_addr | pte_tgt; |
295 | gmmu_new_pte_aperture_video_memory_f()); | 264 | |
296 | 265 | if (attrs->priv) | |
297 | pte_w[0] = pte_valid | pte_addr | pte_tgt; | 266 | pte_w[0] |= gmmu_new_pte_privilege_true_f(); |
298 | 267 | ||
299 | if (priv) | 268 | pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) | |
300 | pte_w[0] |= gmmu_new_pte_privilege_true_f(); | 269 | gmmu_new_pte_kind_f(attrs->kind_v) | |
301 | 270 | gmmu_new_pte_comptagline_f((u32)(attrs->ctag / | |
302 | pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) | | 271 | ctag_granularity)); |
303 | gmmu_new_pte_kind_f(kind_v) | | 272 | |
304 | gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity)); | 273 | if (attrs->rw_flag == gk20a_mem_flag_read_only) |
305 | 274 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | |
306 | if (rw_flag == gk20a_mem_flag_read_only) | 275 | |
307 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | 276 | if (!attrs->valid && !attrs->cacheable) |
308 | if (unmapped_pte && !cacheable) | 277 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); |
309 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | 278 | else if (!attrs->cacheable) |
310 | else if (!cacheable) | ||
311 | pte_w[0] |= gmmu_new_pte_vol_true_f(); | ||
312 | |||
313 | gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d" | ||
314 | " ctag=%d vol=%d" | ||
315 | " [0x%08x, 0x%08x]", | ||
316 | i, *iova, | ||
317 | kind_v, (u32)(*ctag / ctag_granularity), !cacheable, | ||
318 | pte_w[1], pte_w[0]); | ||
319 | |||
320 | if (*ctag) | ||
321 | *ctag += page_size; | ||
322 | } else if (sparse) { | ||
323 | pte_w[0] = gmmu_new_pte_valid_false_f(); | ||
324 | pte_w[0] |= gmmu_new_pte_vol_true_f(); | 279 | pte_w[0] |= gmmu_new_pte_vol_true_f(); |
325 | } else { | ||
326 | gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); | ||
327 | } | ||
328 | 280 | ||
329 | pte_i = pte3_from_index(i); | 281 | if (attrs->ctag) |
330 | 282 | attrs->ctag += page_size; | |
331 | gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]); | 283 | |
332 | gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]); | 284 | } |
333 | 285 | ||
334 | if (*iova) { | 286 | static void __update_pte_sparse(u32 *pte_w) |
335 | *iova += page_size; | 287 | { |
336 | *offset += page_size; | 288 | pte_w[0] = gmmu_new_pte_valid_false_f(); |
337 | if (*sgl && *offset + page_size > (*sgl)->length) { | 289 | pte_w[0] |= gmmu_new_pte_vol_true_f(); |
338 | u64 new_iova; | 290 | } |
339 | *sgl = sg_next(*sgl); | 291 | |
340 | if (*sgl) { | 292 | static void update_gmmu_pte_locked(struct vm_gk20a *vm, |
341 | new_iova = sg_phys(*sgl); | 293 | const struct gk20a_mmu_level *l, |
342 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | 294 | struct nvgpu_gmmu_pd *pd, |
343 | new_iova, (*sgl)->length); | 295 | u32 pd_idx, |
344 | if (new_iova) { | 296 | u64 virt_addr, |
345 | *offset = 0; | 297 | u64 phys_addr, |
346 | *iova = new_iova; | 298 | struct nvgpu_gmmu_attrs *attrs) |
347 | } | 299 | { |
348 | } | 300 | struct gk20a *g = vm->mm->g; |
349 | } | 301 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; |
350 | } | 302 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
351 | return 0; | 303 | u32 pte_w[2] = {0, 0}; |
304 | |||
305 | if (phys_addr) | ||
306 | __update_pte(vm, pte_w, phys_addr, attrs); | ||
307 | else if (attrs->sparse) | ||
308 | __update_pte_sparse(pte_w); | ||
309 | |||
310 | pte_dbg(g, attrs, | ||
311 | "vm=%s " | ||
312 | "PTE: i=%-4u size=%-2u offs=%-4u | " | ||
313 | "GPU %#-12llx phys %#-12llx " | ||
314 | "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " | ||
315 | "ctag=0x%08x " | ||
316 | "[0x%08x, 0x%08x]", | ||
317 | vm->name, | ||
318 | pd_idx, l->entry_size, pd_offset, | ||
319 | virt_addr, phys_addr, | ||
320 | page_size >> 10, | ||
321 | nvgpu_gmmu_perm_str(attrs->rw_flag), | ||
322 | attrs->kind_v, | ||
323 | nvgpu_aperture_str(attrs->aperture), | ||
324 | attrs->valid ? 'V' : '-', | ||
325 | attrs->cacheable ? 'C' : '-', | ||
326 | attrs->sparse ? 'S' : '-', | ||
327 | attrs->priv ? 'P' : '-', | ||
328 | (u32)attrs->ctag / g->ops.fb.compression_page_size(g), | ||
329 | pte_w[1], pte_w[0]); | ||
330 | |||
331 | pd_write(g, pd, pd_offset + 0, pte_w[0]); | ||
332 | pd_write(g, pd, pd_offset + 1, pte_w[1]); | ||
352 | } | 333 | } |
353 | 334 | ||
354 | static const struct gk20a_mmu_level gp10b_mm_levels[] = { | 335 | static const struct gk20a_mmu_level gp10b_mm_levels[] = { |
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, | |||
384 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
385 | struct vm_gk20a *vm) | 366 | struct vm_gk20a *vm) |
386 | { | 367 | { |
387 | u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); | 368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); |
388 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
389 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
390 | 371 | ||
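The gp10b_mm_levels[] table (its body falls outside this excerpt) is what plugs the callbacks above into the generic page-table walker. The sketch below shows the shape such a table takes with the new update_entry signature; the bit ranges and entry sizes are illustrative assumptions, not values copied from the driver:

	/* Illustrative layout only; hi_bit/lo_bit and entry_size are assumed. */
	static const struct gk20a_mmu_level gp10b_mm_levels[] = {
		{ .hi_bit = {48, 48}, .lo_bit = {47, 47},
		  .update_entry = update_gmmu_pde3_locked, .entry_size = 8 },
		/* ... further PDE levels, including the dual small/big PDE
		 * handled by update_gmmu_pde0_locked, elided ... */
		{ .hi_bit = {20, 20}, .lo_bit = {12, 16},
		  .update_entry = update_gmmu_pte_locked, .entry_size = 8 },
		{ .update_entry = NULL },
	};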
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index ed152cd8..28a2cb82 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h | |||
@@ -38,36 +38,97 @@ enum gmmu_pgsz_gk20a { | |||
38 | gmmu_nr_page_sizes = 3, | 38 | gmmu_nr_page_sizes = 3, |
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct gk20a_mm_entry { | 41 | enum gk20a_mem_rw_flag { |
42 | /* backing for */ | 42 | gk20a_mem_flag_none = 0, /* RW */ |
43 | struct nvgpu_mem mem; | 43 | gk20a_mem_flag_read_only = 1, /* RO */ |
44 | u32 woffset; /* if >0, mem is a shadow copy, owned by another entry */ | 44 | gk20a_mem_flag_write_only = 2, /* WO */ |
45 | int pgsz; | 45 | }; |
46 | struct gk20a_mm_entry *entries; | 46 | |
47 | int num_entries; | 47 | /* |
48 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs | ||
49 | * in the GMMU. | ||
50 | */ | ||
51 | struct nvgpu_gmmu_pd { | ||
52 | /* | ||
53 | * DMA memory describing the PDEs or PTEs. | ||
54 | */ | ||
55 | struct nvgpu_mem mem; | ||
56 | |||
57 | /* | ||
58 | * List of pointers to the next level of page tables. Does not | ||
59 | * need to be populated when this PD is pointing to PTEs. | ||
60 | */ | ||
61 | struct nvgpu_gmmu_pd *entries; | ||
62 | int num_entries; | ||
63 | }; | ||
64 | |||
65 | /* | ||
66 | * Reduce the number of arguments getting passed through the various levels of | ||
67 | * GMMU mapping functions. | ||
68 | * | ||
69 | * The following fields are set statically and do not change throughout | ||
70 | * the mapping call: | ||
71 | * | ||
72 | * pgsz: Index into the page size table. | ||
73 | * kind_v: Kind attributes for mapping. | ||
74 | * cacheable: Cacheability of the mapping. | ||
75 | * rw_flag: Flag from enum gk20a_mem_rw_flag | ||
76 | * sparse: Set if the mapping should be sparse. | ||
77 | * priv: Privileged mapping. | ||
78 | * valid: Set if the PTE should be marked valid. | ||
79 | * aperture: VIDMEM or SYSMEM. | ||
80 | * debug: When set, print debugging info. | ||
81 | * | ||
82 | * These fields are dynamically updated as necessary during the map: | ||
83 | * | ||
84 | * ctag: Comptag line in the comptag cache; | ||
85 | * updated every time we write a PTE. | ||
86 | */ | ||
87 | struct nvgpu_gmmu_attrs { | ||
88 | u32 pgsz; | ||
89 | u32 kind_v; | ||
90 | u64 ctag; | ||
91 | bool cacheable; | ||
92 | int rw_flag; | ||
93 | bool sparse; | ||
94 | bool priv; | ||
95 | bool valid; | ||
96 | enum nvgpu_aperture aperture; | ||
97 | bool debug; | ||
48 | }; | 98 | }; |
49 | 99 | ||
50 | struct gk20a_mmu_level { | 100 | struct gk20a_mmu_level { |
51 | int hi_bit[2]; | 101 | int hi_bit[2]; |
52 | int lo_bit[2]; | 102 | int lo_bit[2]; |
53 | int (*update_entry)(struct vm_gk20a *vm, | 103 | |
54 | struct gk20a_mm_entry *pte, | 104 | /* |
55 | u32 i, u32 gmmu_pgsz_idx, | 105 | * Build map from virt_addr -> phys_addr. |
56 | struct scatterlist **sgl, | 106 | */ |
57 | u64 *offset, | 107 | void (*update_entry)(struct vm_gk20a *vm, |
58 | u64 *iova, | 108 | const struct gk20a_mmu_level *l, |
59 | u32 kind_v, u64 *ctag, | 109 | struct nvgpu_gmmu_pd *pd, |
60 | bool cacheable, bool unmapped_pte, | 110 | u32 pd_idx, |
61 | int rw_flag, bool sparse, bool priv, | 111 | u64 phys_addr, |
62 | enum nvgpu_aperture aperture); | 112 | u64 virt_addr, |
63 | size_t entry_size; | 113 | struct nvgpu_gmmu_attrs *attrs); |
114 | u32 entry_size; | ||
64 | }; | 115 | }; |
65 | 116 | ||
66 | int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, | 117 | static inline const char *nvgpu_gmmu_perm_str(enum gk20a_mem_rw_flag p) |
67 | enum gmmu_pgsz_gk20a pgsz_idx, | 118 | { |
68 | const struct gk20a_mmu_level *l, | 119 | switch (p) { |
69 | struct gk20a_mm_entry *entry, | 120 | case gk20a_mem_flag_none: |
70 | struct gk20a_mm_entry *prev_entry); | 121 | return "RW"; |
122 | case gk20a_mem_flag_write_only: | ||
123 | return "WO"; | ||
124 | case gk20a_mem_flag_read_only: | ||
125 | return "RO"; | ||
126 | default: | ||
127 | return "??"; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm); | ||
71 | 132 | ||
72 | /** | 133 | /** |
73 | * nvgpu_gmmu_map - Map memory into the GMMU. | 134 | * nvgpu_gmmu_map - Map memory into the GMMU. |
@@ -106,6 +167,33 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, | |||
106 | u64 gpu_va); | 167 | u64 gpu_va); |
107 | 168 | ||
108 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | 169 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, |
109 | struct gk20a_mm_entry *entry); | 170 | struct nvgpu_gmmu_pd *entry); |
171 | |||
172 | /* | ||
173 | * Some useful routines that are shared across chips. | ||
174 | */ | ||
175 | static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l, | ||
176 | u32 pd_idx) | ||
177 | { | ||
178 | return (pd_idx * l->entry_size) / sizeof(u32); | ||
179 | } | ||
180 | |||
181 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, | ||
182 | size_t w, size_t data) | ||
183 | { | ||
184 | nvgpu_mem_wr32(g, &pd->mem, w, data); | ||
185 | } | ||
186 | |||
187 | |||
188 | /* | ||
189 | * Internal debugging routines. Probably not something you want to use. | ||
190 | */ | ||
191 | #define pte_dbg(g, attrs, fmt, args...) \ | ||
192 | do { \ | ||
193 | if (attrs && attrs->debug) \ | ||
194 | nvgpu_info(g, fmt, ##args); \ | ||
195 | else \ | ||
196 | nvgpu_log(g, gpu_dbg_pte, fmt, ##args); \ | ||
197 | } while (0) | ||
110 | 198 | ||
111 | #endif | 199 | #endif |
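To make the new interface concrete, a minimal usage sketch follows: the common map path fills one nvgpu_gmmu_attrs for the whole operation, and each level's update_entry callback turns it into directory or PTE words through the helpers above. The field values and the word0/word1 variables are illustrative only:

	/* sketch: describe the whole mapping once, up front */
	struct nvgpu_gmmu_attrs attrs = {
		.pgsz      = gmmu_page_size_small,
		.kind_v    = 0,
		.cacheable = true,
		.rw_flag   = gk20a_mem_flag_none,    /* prints as "RW" */
		.aperture  = APERTURE_SYSMEM,
		.valid     = true,
	};

	/* sketch: inside an update_entry callback, one two-word entry */
	u32 off = pd_offset_from_index(l, pd_idx);
	pd_write(g, pd, off + 0, word0);
	pd_write(g, pd, off + 1, word1);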
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h index 66d04ab8..4259d40f 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h | |||
@@ -109,9 +109,9 @@ nvgpu_mem_from_clear_list_entry(struct nvgpu_list_node *node) | |||
109 | static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) | 109 | static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) |
110 | { | 110 | { |
111 | switch (aperture) { | 111 | switch (aperture) { |
112 | case APERTURE_INVALID: return "invalid"; | 112 | case APERTURE_INVALID: return "INVAL"; |
113 | case APERTURE_SYSMEM: return "sysmem"; | 113 | case APERTURE_SYSMEM: return "SYSMEM"; |
114 | case APERTURE_VIDMEM: return "vidmem"; | 114 | case APERTURE_VIDMEM: return "VIDMEM"; |
115 | }; | 115 | }; |
116 | return "UNKNOWN"; | 116 | return "UNKNOWN"; |
117 | } | 117 | } |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h index f6d88cc3..255b4361 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/vm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h | |||
@@ -126,6 +126,7 @@ mapped_buffer_from_rbtree_node(struct nvgpu_rbtree_node *node) | |||
126 | struct vm_gk20a { | 126 | struct vm_gk20a { |
127 | struct mm_gk20a *mm; | 127 | struct mm_gk20a *mm; |
128 | struct gk20a_as_share *as_share; /* as_share this represents */ | 128 | struct gk20a_as_share *as_share; /* as_share this represents */ |
129 | char name[20]; | ||
129 | 130 | ||
130 | u64 va_start; | 131 | u64 va_start; |
131 | u64 va_limit; | 132 | u64 va_limit; |
@@ -145,7 +146,7 @@ struct vm_gk20a { | |||
145 | 146 | ||
146 | struct nvgpu_mutex update_gmmu_lock; | 147 | struct nvgpu_mutex update_gmmu_lock; |
147 | 148 | ||
148 | struct gk20a_mm_entry pdb; | 149 | struct nvgpu_gmmu_pd pdb; |
149 | 150 | ||
150 | /* | 151 | /* |
151 | * These structs define the address spaces. In some cases it's possible | 152 | * These structs define the address spaces. In some cases it's possible |