-rw-r--r--  drivers/gpu/nvgpu/Makefile.nvgpu                   |   1
-rw-r--r--  drivers/gpu/nvgpu/common/linux/nvgpu_mem.c         | 114
-rw-r--r--  drivers/gpu/nvgpu/common/linux/vm.c                |  25
-rw-r--r--  drivers/gpu/nvgpu/common/mm/gmmu.c                 | 109
-rw-r--r--  drivers/gpu/nvgpu/common/mm/nvgpu_mem.c            |  73
-rw-r--r--  drivers/gpu/nvgpu/common/mm/page_allocator.c       | 142
-rw-r--r--  drivers/gpu/nvgpu/common/pramin.c                  |  27
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                    |   9
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c                 |  20
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h                 |  43
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pramin_gk20a.c             |  13
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pramin_gk20a.h             |   6
-rw-r--r--  drivers/gpu/nvgpu/gp10b/gr_gp10b.c                 |   2
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h             |   2
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h  |   2
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/log.h              |   1
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h        |  45
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/page_allocator.h   |  22
-rw-r--r--  drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c       |  55
-rw-r--r--  drivers/gpu/nvgpu/vgpu/mm_vgpu.c                   |   4
20 files changed, 474 insertions(+), 241 deletions(-)
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index d02870fb..6e475fcb 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -55,6 +55,7 @@ nvgpu-y := \
55 common/mm/pd_cache.o \ 55 common/mm/pd_cache.o \
56 common/mm/vm.o \ 56 common/mm/vm.o \
57 common/mm/vm_area.o \ 57 common/mm/vm_area.o \
58 common/mm/nvgpu_mem.o \
58 common/bus.o \ 59 common/bus.o \
59 common/enabled.o \ 60 common/enabled.o \
60 common/pramin.o \ 61 common/pramin.o \
diff --git a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
index e4991d0d..eb54f3fd 100644
--- a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
+++ b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
@@ -21,6 +21,7 @@
21#include <nvgpu/log.h> 21#include <nvgpu/log.h>
22#include <nvgpu/bug.h> 22#include <nvgpu/bug.h>
23#include <nvgpu/enabled.h> 23#include <nvgpu/enabled.h>
24#include <nvgpu/kmem.h>
24 25
25#include <nvgpu/linux/dma.h> 26#include <nvgpu/linux/dma.h>
26 27
@@ -395,3 +396,116 @@ int __nvgpu_mem_create_from_pages(struct gk20a *g, struct nvgpu_mem *dest,
395 396
396 return 0; 397 return 0;
397} 398}
399
400static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_dup(struct gk20a *g,
401 struct nvgpu_mem_sgl *sgl)
402{
403 struct nvgpu_mem_sgl *head, *next;
404
405 head = nvgpu_kzalloc(g, sizeof(*sgl));
406 if (!head)
407 return NULL;
408
409 next = head;
410 while (true) {
411 nvgpu_log(g, gpu_dbg_sgl,
412 " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx",
413 sgl->phys, sgl->dma, sgl->length);
414
415 next->dma = sgl->dma;
416 next->phys = sgl->phys;
417 next->length = sgl->length;
418 next->next = NULL;
419
420 sgl = nvgpu_mem_sgl_next(sgl);
421 if (!sgl)
422 break;
423
424 next->next = nvgpu_kzalloc(g, sizeof(*sgl));
425 if (!next->next) {
426 nvgpu_mem_sgl_free(g, head);
427 return NULL;
428 }
429 next = next->next;
430 }
431
432 return head;
433}
434
435static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_create_from_vidmem(
436 struct gk20a *g,
437 struct scatterlist *linux_sgl)
438{
439 struct nvgpu_page_alloc *vidmem_alloc;
440
441 vidmem_alloc = get_vidmem_page_alloc(linux_sgl);
442 if (!vidmem_alloc)
443 return NULL;
444
445 nvgpu_log(g, gpu_dbg_sgl, "Vidmem sgl:");
446
447 return __nvgpu_mem_sgl_dup(g, vidmem_alloc->sgl);
448}
449
450struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g,
451 struct sg_table *sgt)
452{
453 struct nvgpu_mem_sgl *head, *sgl, *next;
454 struct scatterlist *linux_sgl = sgt->sgl;
455
456 if (is_vidmem_page_alloc(sg_dma_address(linux_sgl)))
457 return __nvgpu_mem_sgl_create_from_vidmem(g, linux_sgl);
458
459 head = nvgpu_kzalloc(g, sizeof(*sgl));
460 if (!head)
461 return NULL;
462
463 nvgpu_log(g, gpu_dbg_sgl, "Making sgl:");
464
465 sgl = head;
466 while (true) {
467 sgl->dma = sg_dma_address(linux_sgl);
468 sgl->phys = sg_phys(linux_sgl);
469 sgl->length = linux_sgl->length;
470
471 /*
472 * We don't like offsets in the pages here. This will cause
473 * problems.
474 */
475 if (WARN_ON(linux_sgl->offset)) {
476 nvgpu_mem_sgl_free(g, head);
477 return NULL;
478 }
479
480 nvgpu_log(g, gpu_dbg_sgl,
481 " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx",
482 sgl->phys, sgl->dma, sgl->length);
483
484 /*
485 * When there's no more SGL ents for the Linux SGL we are
486 * done. Don't bother making any more SGL ents for the nvgpu
487 * SGL.
488 */
489 linux_sgl = sg_next(linux_sgl);
490 if (!linux_sgl)
491 break;
492
493 next = nvgpu_kzalloc(g, sizeof(*sgl));
494 if (!next) {
495 nvgpu_mem_sgl_free(g, head);
496 return NULL;
497 }
498
499 sgl->next = next;
500 sgl = next;
501 }
502
503 nvgpu_log(g, gpu_dbg_sgl, "Done!");
504 return head;
505}
506
507struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g,
508 struct nvgpu_mem *mem)
509{
510 return nvgpu_mem_sgl_create(g, mem->priv.sgt);
511}
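The constructor above either duplicates the vidmem allocator's existing chain or walks the Linux sg_table entry by entry, so the rest of the driver only ever sees the OS-agnostic list. A minimal sketch of the intended caller-side usage, using only functions introduced in this change (the dump helper itself is illustrative, not part of the patch):

static void dump_mem_chunks(struct gk20a *g, struct nvgpu_mem *mem)
{
	struct nvgpu_mem_sgl *head, *sgl;

	/* Duplicate the buffer's scatter list into nvgpu's own format. */
	head = nvgpu_mem_sgl_create(g, mem->priv.sgt);
	if (!head)
		return;

	for (sgl = head; sgl; sgl = nvgpu_mem_sgl_next(sgl))
		nvgpu_log(g, gpu_dbg_sgl, "chunk: phys=0x%llx len=0x%llx",
			  nvgpu_mem_sgl_phys(sgl),
			  nvgpu_mem_sgl_length(sgl));

	/* The copy belongs to the caller and must be freed explicitly. */
	nvgpu_mem_sgl_free(g, head);
}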
diff --git a/drivers/gpu/nvgpu/common/linux/vm.c b/drivers/gpu/nvgpu/common/linux/vm.c
index 86d8bec9..4a4429dc 100644
--- a/drivers/gpu/nvgpu/common/linux/vm.c
+++ b/drivers/gpu/nvgpu/common/linux/vm.c
@@ -21,8 +21,11 @@
21#include <nvgpu/lock.h> 21#include <nvgpu/lock.h>
22#include <nvgpu/rbtree.h> 22#include <nvgpu/rbtree.h>
23#include <nvgpu/vm_area.h> 23#include <nvgpu/vm_area.h>
24#include <nvgpu/nvgpu_mem.h>
24#include <nvgpu/page_allocator.h> 25#include <nvgpu/page_allocator.h>
25 26
27#include <nvgpu/linux/nvgpu_mem.h>
28
26#include "gk20a/gk20a.h" 29#include "gk20a/gk20a.h"
27#include "gk20a/mm_gk20a.h" 30#include "gk20a/mm_gk20a.h"
28#include "gk20a/kind_gk20a.h" 31#include "gk20a/kind_gk20a.h"
@@ -66,17 +69,19 @@ static u64 nvgpu_get_buffer_alignment(struct gk20a *g, struct scatterlist *sgl,
66 69
67 if (aperture == APERTURE_VIDMEM) { 70 if (aperture == APERTURE_VIDMEM) {
68 struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl); 71 struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl);
69 struct page_alloc_chunk *chunk = NULL; 72 struct nvgpu_mem_sgl *sgl_vid = alloc->sgl;
70 73
71 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 74 while (sgl_vid) {
72 page_alloc_chunk, list_entry) { 75 chunk_align = 1ULL <<
73 chunk_align = 1ULL << __ffs(chunk->base | 76 __ffs(nvgpu_mem_sgl_phys(sgl_vid) |
74 chunk->length); 77 nvgpu_mem_sgl_length(sgl_vid));
75 78
76 if (align) 79 if (align)
77 align = min(align, chunk_align); 80 align = min(align, chunk_align);
78 else 81 else
79 align = chunk_align; 82 align = chunk_align;
83
84 sgl_vid = nvgpu_mem_sgl_next(sgl_vid);
80 } 85 }
81 86
82 return align; 87 return align;
@@ -237,6 +242,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm,
237 struct nvgpu_vm_area *vm_area = NULL; 242 struct nvgpu_vm_area *vm_area = NULL;
238 u32 ctag_offset; 243 u32 ctag_offset;
239 enum nvgpu_aperture aperture; 244 enum nvgpu_aperture aperture;
245 struct nvgpu_mem_sgl *nvgpu_sgl;
240 246
241 /* 247 /*
242 * The kind used as part of the key for map caching. HW may 248 * The kind used as part of the key for map caching. HW may
@@ -393,9 +399,12 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm,
393 ctag_offset += buffer_offset >> 399 ctag_offset += buffer_offset >>
394 ilog2(g->ops.fb.compression_page_size(g)); 400 ilog2(g->ops.fb.compression_page_size(g));
395 401
402 nvgpu_sgl = nvgpu_mem_sgl_create(g, bfr.sgt);
403
396 /* update gmmu ptes */ 404 /* update gmmu ptes */
397 map_offset = g->ops.mm.gmmu_map(vm, map_offset, 405 map_offset = g->ops.mm.gmmu_map(vm,
398 bfr.sgt, 406 map_offset,
407 nvgpu_sgl,
399 buffer_offset, /* sg offset */ 408 buffer_offset, /* sg offset */
400 mapping_size, 409 mapping_size,
401 bfr.pgsz_idx, 410 bfr.pgsz_idx,
@@ -410,6 +419,8 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm,
410 if (!map_offset) 419 if (!map_offset)
411 goto clean_up; 420 goto clean_up;
412 421
422 nvgpu_mem_sgl_free(g, nvgpu_sgl);
423
413 mapped_buffer = nvgpu_kzalloc(g, sizeof(*mapped_buffer)); 424 mapped_buffer = nvgpu_kzalloc(g, sizeof(*mapped_buffer));
414 if (!mapped_buffer) { 425 if (!mapped_buffer) {
415 nvgpu_warn(g, "oom allocating tracking buffer"); 426 nvgpu_warn(g, "oom allocating tracking buffer");
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 7f486d68..41f5acdd 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -65,11 +65,14 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm,
65 struct gk20a *g = gk20a_from_vm(vm); 65 struct gk20a *g = gk20a_from_vm(vm);
66 u64 vaddr; 66 u64 vaddr;
67 67
68 struct sg_table *sgt = mem->priv.sgt; 68 struct nvgpu_mem_sgl *sgl = nvgpu_mem_sgl_create_from_mem(g, mem);
69
70 if (!sgl)
71 return -ENOMEM;
69 72
70 nvgpu_mutex_acquire(&vm->update_gmmu_lock); 73 nvgpu_mutex_acquire(&vm->update_gmmu_lock);
71 vaddr = g->ops.mm.gmmu_map(vm, addr, 74 vaddr = g->ops.mm.gmmu_map(vm, addr,
72 sgt, /* sg table */ 75 sgl, /* sg list */
73 0, /* sg offset */ 76 0, /* sg offset */
74 size, 77 size,
75 gmmu_page_size_kernel, 78 gmmu_page_size_kernel,
@@ -82,8 +85,11 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm,
82 NULL, /* mapping_batch handle */ 85 NULL, /* mapping_batch handle */
83 aperture); 86 aperture);
84 nvgpu_mutex_release(&vm->update_gmmu_lock); 87 nvgpu_mutex_release(&vm->update_gmmu_lock);
88
89 nvgpu_mem_sgl_free(g, sgl);
90
85 if (!vaddr) { 91 if (!vaddr) {
86 nvgpu_err(g, "failed to allocate va space"); 92 nvgpu_err(g, "failed to map buffer!");
87 return 0; 93 return 0;
88 } 94 }
89 95
@@ -91,7 +97,7 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm,
91} 97}
92 98
93/* 99/*
94 * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. 100 * Map a nvgpu_mem into the GMMU. This is for kernel space to use.
95 */ 101 */
96u64 nvgpu_gmmu_map(struct vm_gk20a *vm, 102u64 nvgpu_gmmu_map(struct vm_gk20a *vm,
97 struct nvgpu_mem *mem, 103 struct nvgpu_mem *mem,
@@ -106,7 +112,7 @@ u64 nvgpu_gmmu_map(struct vm_gk20a *vm,
106} 112}
107 113
108/* 114/*
109 * Like nvgpu_gmmu_map() except it can work on a fixed address instead. 115 * Like nvgpu_gmmu_map() except this can work on a fixed address.
110 */ 116 */
111u64 nvgpu_gmmu_map_fixed(struct vm_gk20a *vm, 117u64 nvgpu_gmmu_map_fixed(struct vm_gk20a *vm,
112 struct nvgpu_mem *mem, 118 struct nvgpu_mem *mem,
@@ -407,7 +413,7 @@ static int __set_pd_level(struct vm_gk20a *vm,
407 */ 413 */
408 target_addr = next_pd ? 414 target_addr = next_pd ?
409 nvgpu_pde_phys_addr(g, next_pd) : 415 nvgpu_pde_phys_addr(g, next_pd) :
410 g->ops.mm.gpu_phys_addr(g, attrs, phys_addr); 416 phys_addr;
411 417
412 l->update_entry(vm, l, 418 l->update_entry(vm, l,
413 pd, pd_idx, 419 pd, pd_idx,
@@ -458,18 +464,16 @@ static int __set_pd_level(struct vm_gk20a *vm,
458 * VIDMEM version of the update_ptes logic. 464 * VIDMEM version of the update_ptes logic.
459 */ 465 */
460static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, 466static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
461 struct sg_table *sgt, 467 struct nvgpu_mem_sgl *sgl,
462 u64 space_to_skip, 468 u64 space_to_skip,
463 u64 virt_addr, 469 u64 virt_addr,
464 u64 length, 470 u64 length,
465 struct nvgpu_gmmu_attrs *attrs) 471 struct nvgpu_gmmu_attrs *attrs)
466{ 472{
467 struct nvgpu_page_alloc *alloc = NULL;
468 struct page_alloc_chunk *chunk = NULL;
469 u64 phys_addr, chunk_length; 473 u64 phys_addr, chunk_length;
470 int err = 0; 474 int err = 0;
471 475
472 if (!sgt) { 476 if (!sgl) {
473 /* 477 /*
474 * This is considered an unmap. Just pass in 0 as the physical 478 * This is considered an unmap. Just pass in 0 as the physical
475 * address for the entire GPU range. 479 * address for the entire GPU range.
@@ -482,22 +486,21 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
482 return err; 486 return err;
483 } 487 }
484 488
485 alloc = get_vidmem_page_alloc(sgt->sgl);
486
487 /* 489 /*
488 * Otherwise iterate across all the chunks in this allocation and 490 * Otherwise iterate across all the chunks in this allocation and
489 * map them. 491 * map them.
490 */ 492 */
491 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 493 while (sgl) {
492 page_alloc_chunk, list_entry) {
493 if (space_to_skip && 494 if (space_to_skip &&
494 space_to_skip >= chunk->length) { 495 space_to_skip >= nvgpu_mem_sgl_length(sgl)) {
495 space_to_skip -= chunk->length; 496 space_to_skip -= nvgpu_mem_sgl_length(sgl);
497 sgl = nvgpu_mem_sgl_next(sgl);
496 continue; 498 continue;
497 } 499 }
498 500
499 phys_addr = chunk->base + space_to_skip; 501 phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip;
500 chunk_length = min(length, (chunk->length - space_to_skip)); 502 chunk_length = min(length, (nvgpu_mem_sgl_length(sgl) -
503 space_to_skip));
501 504
502 err = __set_pd_level(vm, &vm->pdb, 505 err = __set_pd_level(vm, &vm->pdb,
503 0, 506 0,
@@ -518,23 +521,24 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
518 521
519 if (length == 0) 522 if (length == 0)
520 break; 523 break;
524
525 sgl = nvgpu_mem_sgl_next(sgl);
521 } 526 }
522 527
523 return err; 528 return err;
524} 529}
525 530
526static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, 531static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
527 struct sg_table *sgt, 532 struct nvgpu_mem_sgl *sgl,
528 u64 space_to_skip, 533 u64 space_to_skip,
529 u64 virt_addr, 534 u64 virt_addr,
530 u64 length, 535 u64 length,
531 struct nvgpu_gmmu_attrs *attrs) 536 struct nvgpu_gmmu_attrs *attrs)
532{ 537{
533 int err; 538 int err;
534 struct scatterlist *sgl;
535 struct gk20a *g = gk20a_from_vm(vm); 539 struct gk20a *g = gk20a_from_vm(vm);
536 540
537 if (!sgt) { 541 if (!sgl) {
538 /* 542 /*
539 * This is considered an unmap. Just pass in 0 as the physical 543 * This is considered an unmap. Just pass in 0 as the physical
540 * address for the entire GPU range. 544 * address for the entire GPU range.
@@ -548,19 +552,15 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
548 } 552 }
549 553
550 /* 554 /*
551 * At this point we have a Linux scatter-gather list pointing to some 555 * At this point we have a scatter-gather list pointing to some number
552 * number of discontiguous chunks of memory. Iterate over that list and 556 * of discontiguous chunks of memory. We must iterate over that list and
553 * generate a GMMU map call for each chunk. There are two possibilities: 557 * generate a GMMU map call for each chunk. There are two possibilities:
554 * either the IOMMU is enabled or not. When the IOMMU is enabled the 558 * either an IOMMU is enabled or not. When an IOMMU is enabled the
555 * mapping is simple since the "physical" address is actually a virtual 559 * mapping is simple since the "physical" address is actually a virtual
556 * IO address and will be contiguous. The no-IOMMU case is more 560 * IO address and will be contiguous.
557 * complicated. We will have to iterate over the SGT and do a separate
558 * map for each chunk of the SGT.
559 */ 561 */
560 sgl = sgt->sgl;
561
562 if (!g->mm.bypass_smmu) { 562 if (!g->mm.bypass_smmu) {
563 u64 io_addr = nvgpu_mem_get_addr_sgl(g, sgl); 563 u64 io_addr = nvgpu_mem_sgl_gpu_addr(g, sgl, attrs);
564 564
565 io_addr += space_to_skip; 565 io_addr += space_to_skip;
566 566
@@ -585,14 +585,16 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
585 /* 585 /*
586 * Cut out sgl ents for space_to_skip. 586 * Cut out sgl ents for space_to_skip.
587 */ 587 */
588 if (space_to_skip && space_to_skip >= sgl->length) { 588 if (space_to_skip &&
589 space_to_skip -= sgl->length; 589 space_to_skip >= nvgpu_mem_sgl_length(sgl)) {
590 sgl = sg_next(sgl); 590 space_to_skip -= nvgpu_mem_sgl_length(sgl);
591 sgl = nvgpu_mem_sgl_next(sgl);
591 continue; 592 continue;
592 } 593 }
593 594
594 phys_addr = sg_phys(sgl) + space_to_skip; 595 phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip;
595 chunk_length = min(length, sgl->length - space_to_skip); 596 chunk_length = min(length,
597 nvgpu_mem_sgl_length(sgl) - space_to_skip);
596 598
597 err = __set_pd_level(vm, &vm->pdb, 599 err = __set_pd_level(vm, &vm->pdb,
598 0, 600 0,
@@ -600,13 +602,11 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
600 virt_addr, 602 virt_addr,
601 chunk_length, 603 chunk_length,
602 attrs); 604 attrs);
603 if (err)
604 return err;
605 605
606 space_to_skip = 0; 606 space_to_skip = 0;
607 virt_addr += chunk_length; 607 virt_addr += chunk_length;
608 length -= chunk_length; 608 length -= chunk_length;
609 sgl = sg_next(sgl); 609 sgl = nvgpu_mem_sgl_next(sgl);
610 610
611 if (length == 0) 611 if (length == 0)
612 break; 612 break;
@@ -624,22 +624,20 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
624 * implementations. But the logic around that is generic to all chips. Every 624 * implementations. But the logic around that is generic to all chips. Every
625 * chip has some number of PDE levels and then a PTE level. 625 * chip has some number of PDE levels and then a PTE level.
626 * 626 *
627 * Each chunk of the incoming SGT is sent to the chip specific implementation 627 * Each chunk of the incoming SGL is sent to the chip specific implementation
628 * of page table update. 628 * of page table update.
629 * 629 *
630 * [*] Note: the "physical" address may actually be an IO virtual address in the 630 * [*] Note: the "physical" address may actually be an IO virtual address in the
631 * case of SMMU usage. 631 * case of SMMU usage.
632 */ 632 */
633static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, 633static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
634 struct sg_table *sgt, 634 struct nvgpu_mem_sgl *sgl,
635 u64 space_to_skip, 635 u64 space_to_skip,
636 u64 virt_addr, 636 u64 virt_addr,
637 u64 length, 637 u64 length,
638 struct nvgpu_gmmu_attrs *attrs) 638 struct nvgpu_gmmu_attrs *attrs)
639{ 639{
640 struct gk20a *g = gk20a_from_vm(vm); 640 struct gk20a *g = gk20a_from_vm(vm);
641 struct nvgpu_page_alloc *alloc;
642 u64 phys_addr = 0;
643 u32 page_size; 641 u32 page_size;
644 int err; 642 int err;
645 643
@@ -665,25 +663,16 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
665 return err; 663 return err;
666 } 664 }
667 665
668 if (sgt) {
669 if (attrs->aperture == APERTURE_VIDMEM) {
670 alloc = get_vidmem_page_alloc(sgt->sgl);
671
672 phys_addr = alloc->base;
673 } else
674 phys_addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl);
675 }
676
677 __gmmu_dbg(g, attrs, 666 __gmmu_dbg(g, attrs,
678 "vm=%s " 667 "vm=%s "
679 "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " 668 "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx "
680 "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " 669 "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | "
681 "kind=%#02x APT=%-6s %c%c%c%c%c", 670 "kind=%#02x APT=%-6s %c%c%c%c%c",
682 vm->name, 671 vm->name,
683 sgt ? "MAP" : "UNMAP", 672 sgl ? "MAP" : "UNMAP",
684 virt_addr, 673 virt_addr,
685 length, 674 length,
686 phys_addr, 675 sgl ? nvgpu_mem_sgl_phys(sgl) : 0,
687 space_to_skip, 676 space_to_skip,
688 page_size >> 10, 677 page_size >> 10,
689 nvgpu_gmmu_perm_str(attrs->rw_flag), 678 nvgpu_gmmu_perm_str(attrs->rw_flag),
@@ -696,19 +685,19 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
696 attrs->valid ? 'V' : '-'); 685 attrs->valid ? 'V' : '-');
697 686
698 /* 687 /*
699 * Handle VIDMEM progamming. Currently uses a different scatter list 688 * For historical reasons these are separate, but soon these will be
700 * format. 689 * unified.
701 */ 690 */
702 if (attrs->aperture == APERTURE_VIDMEM) 691 if (attrs->aperture == APERTURE_VIDMEM)
703 err = __nvgpu_gmmu_update_page_table_vidmem(vm, 692 err = __nvgpu_gmmu_update_page_table_vidmem(vm,
704 sgt, 693 sgl,
705 space_to_skip, 694 space_to_skip,
706 virt_addr, 695 virt_addr,
707 length, 696 length,
708 attrs); 697 attrs);
709 else 698 else
710 err = __nvgpu_gmmu_update_page_table_sysmem(vm, 699 err = __nvgpu_gmmu_update_page_table_sysmem(vm,
711 sgt, 700 sgl,
712 space_to_skip, 701 space_to_skip,
713 virt_addr, 702 virt_addr,
714 length, 703 length,
@@ -717,7 +706,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
717 unmap_gmmu_pages(g, &vm->pdb); 706 unmap_gmmu_pages(g, &vm->pdb);
718 nvgpu_smp_mb(); 707 nvgpu_smp_mb();
719 708
720 __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); 709 __gmmu_dbg(g, attrs, "%-5s Done!", sgl ? "MAP" : "UNMAP");
721 710
722 return err; 711 return err;
723} 712}
@@ -736,7 +725,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
736 */ 725 */
737u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 726u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
738 u64 vaddr, 727 u64 vaddr,
739 struct sg_table *sgt, 728 struct nvgpu_mem_sgl *sgl,
740 u64 buffer_offset, 729 u64 buffer_offset,
741 u64 size, 730 u64 size,
742 int pgsz_idx, 731 int pgsz_idx,
@@ -785,7 +774,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
785 allocated = true; 774 allocated = true;
786 } 775 }
787 776
788 err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, 777 err = __nvgpu_gmmu_update_page_table(vm, sgl, buffer_offset,
789 vaddr, size, &attrs); 778 vaddr, size, &attrs);
790 if (err) { 779 if (err) {
791 nvgpu_err(g, "failed to update ptes on map"); 780 nvgpu_err(g, "failed to update ptes on map");
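Both update paths now share the same walk: drop list entries until space_to_skip falls inside a chunk, then map min(length, remainder of the chunk) at a time and advance. The bookkeeping, reduced to a sketch (the helper name is hypothetical; the real code calls __set_pd_level() where the log statement sits):

static void walk_map_chunks(struct vm_gk20a *vm, struct nvgpu_mem_sgl *sgl,
			    u64 space_to_skip, u64 virt_addr, u64 length)
{
	struct gk20a *g = gk20a_from_vm(vm);

	while (sgl) {
		u64 chunk_len;

		/* Skip whole chunks that lie before the requested offset. */
		if (space_to_skip &&
		    space_to_skip >= nvgpu_mem_sgl_length(sgl)) {
			space_to_skip -= nvgpu_mem_sgl_length(sgl);
			sgl = nvgpu_mem_sgl_next(sgl);
			continue;
		}

		chunk_len = min(length,
				nvgpu_mem_sgl_length(sgl) - space_to_skip);

		/* The real code programs the page tables here. */
		nvgpu_log(g, gpu_dbg_sgl, "map 0x%llx -> 0x%llx (0x%llx)",
			  nvgpu_mem_sgl_phys(sgl) + space_to_skip,
			  virt_addr, chunk_len);

		/* Only the first mapped chunk can start mid-chunk. */
		space_to_skip = 0;
		virt_addr += chunk_len;
		length -= chunk_len;
		sgl = nvgpu_mem_sgl_next(sgl);

		if (length == 0)
			break;
	}
}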
diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c
new file mode 100644
index 00000000..7296c673
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c
@@ -0,0 +1,73 @@
1/*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/kmem.h>
18#include <nvgpu/nvgpu_mem.h>
19
20#include "gk20a/gk20a.h"
21
22struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl)
23{
24 return sgl->next;
25}
26
27u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl)
28{
29 return sgl->phys;
30}
31
32u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl)
33{
34 return sgl->dma;
35}
36
37u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl)
38{
39 return sgl->length;
40}
41
42/*
43 * This builds a GPU address for the %sgl based on whether an IOMMU is present
44 * or not. It also handles turning the physical address into the true GPU
45 * physical address that should be programmed into the page tables.
46 */
47u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl,
48 struct nvgpu_gmmu_attrs *attrs)
49{
50 if (nvgpu_mem_sgl_dma(sgl) == 0)
51 return g->ops.mm.gpu_phys_addr(g, attrs,
52 nvgpu_mem_sgl_phys(sgl));
53
54 if (nvgpu_mem_sgl_dma(sgl) == DMA_ERROR_CODE)
55 return 0;
56
57 return gk20a_mm_smmu_vaddr_translate(g, nvgpu_mem_sgl_dma(sgl));
58}
59
60void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl)
61{
62 struct nvgpu_mem_sgl *next;
63
64 /*
65 * Free each of the elements. We expect each element to have been
66 * nvgpu_k[mz]alloc()ed.
67 */
68 while (sgl) {
69 next = nvgpu_mem_sgl_next(sgl);
70 nvgpu_kfree(g, sgl);
71 sgl = next;
72 }
73}
diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c
index 72ff8f2d..6d92b457 100644
--- a/drivers/gpu/nvgpu/common/mm/page_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c
@@ -147,19 +147,16 @@ static void __nvgpu_free_pages(struct nvgpu_page_allocator *a,
147 struct nvgpu_page_alloc *alloc, 147 struct nvgpu_page_alloc *alloc,
148 bool free_buddy_alloc) 148 bool free_buddy_alloc)
149{ 149{
150 struct page_alloc_chunk *chunk; 150 struct nvgpu_mem_sgl *sgl = alloc->sgl;
151 151
152 while (!nvgpu_list_empty(&alloc->alloc_chunks)) { 152 if (free_buddy_alloc) {
153 chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, 153 while (sgl) {
154 page_alloc_chunk, 154 nvgpu_free(&a->source_allocator, sgl->phys);
155 list_entry); 155 sgl = nvgpu_mem_sgl_next(sgl);
156 nvgpu_list_del(&chunk->list_entry); 156 }
157
158 if (free_buddy_alloc)
159 nvgpu_free(&a->source_allocator, chunk->base);
160 nvgpu_kmem_cache_free(a->chunk_cache, chunk);
161 } 157 }
162 158
159 nvgpu_mem_sgl_free(a->owner->g, alloc->sgl);
163 nvgpu_kmem_cache_free(a->alloc_cache, alloc); 160 nvgpu_kmem_cache_free(a->alloc_cache, alloc);
164} 161}
165 162
@@ -243,15 +240,14 @@ static void free_slab_page(struct nvgpu_page_allocator *a,
243} 240}
244 241
245/* 242/*
246 * This expects @alloc to have 1 empty page_alloc_chunk already added to the 243 * This expects @alloc to have 1 empty sgl_entry ready for usage.
247 * alloc_chunks list.
248 */ 244 */
249static int __do_slab_alloc(struct nvgpu_page_allocator *a, 245static int __do_slab_alloc(struct nvgpu_page_allocator *a,
250 struct page_alloc_slab *slab, 246 struct page_alloc_slab *slab,
251 struct nvgpu_page_alloc *alloc) 247 struct nvgpu_page_alloc *alloc)
252{ 248{
253 struct page_alloc_slab_page *slab_page = NULL; 249 struct page_alloc_slab_page *slab_page = NULL;
254 struct page_alloc_chunk *chunk; 250 struct nvgpu_mem_sgl *sgl;
255 unsigned long offs; 251 unsigned long offs;
256 252
257 /* 253 /*
@@ -302,18 +298,19 @@ static int __do_slab_alloc(struct nvgpu_page_allocator *a,
302 BUG(); /* Should be impossible to hit this. */ 298 BUG(); /* Should be impossible to hit this. */
303 299
304 /* 300 /*
305 * Handle building the nvgpu_page_alloc struct. We expect one 301 * Handle building the nvgpu_page_alloc struct. We expect one sgl
306 * page_alloc_chunk to be present. 302 * to be present.
307 */ 303 */
308 alloc->slab_page = slab_page; 304 alloc->slab_page = slab_page;
309 alloc->nr_chunks = 1; 305 alloc->nr_chunks = 1;
310 alloc->length = slab_page->slab_size; 306 alloc->length = slab_page->slab_size;
311 alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); 307 alloc->base = slab_page->page_addr + (offs * slab_page->slab_size);
312 308
313 chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, 309 sgl = alloc->sgl;
314 page_alloc_chunk, list_entry); 310 sgl->phys = alloc->base;
315 chunk->base = alloc->base; 311 sgl->dma = alloc->base;
316 chunk->length = alloc->length; 312 sgl->length = alloc->length;
313 sgl->next = NULL;
317 314
318 return 0; 315 return 0;
319} 316}
@@ -327,7 +324,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab(
327 int err, slab_nr; 324 int err, slab_nr;
328 struct page_alloc_slab *slab; 325 struct page_alloc_slab *slab;
329 struct nvgpu_page_alloc *alloc = NULL; 326 struct nvgpu_page_alloc *alloc = NULL;
330 struct page_alloc_chunk *chunk = NULL; 327 struct nvgpu_mem_sgl *sgl = NULL;
331 328
332 /* 329 /*
333 * Align the length to a page and then divide by the page size (4k for 330 * Align the length to a page and then divide by the page size (4k for
@@ -341,15 +338,13 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab(
341 palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); 338 palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n");
342 goto fail; 339 goto fail;
343 } 340 }
344 chunk = nvgpu_kmem_cache_alloc(a->chunk_cache); 341 sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl));
345 if (!chunk) { 342 if (!sgl) {
346 palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); 343 palloc_dbg(a, "OOM: could not alloc sgl struct!\n");
347 goto fail; 344 goto fail;
348 } 345 }
349 346
350 nvgpu_init_list_node(&alloc->alloc_chunks); 347 alloc->sgl = sgl;
351 nvgpu_list_add(&chunk->list_entry, &alloc->alloc_chunks);
352
353 err = __do_slab_alloc(a, slab, alloc); 348 err = __do_slab_alloc(a, slab, alloc);
354 if (err) 349 if (err)
355 goto fail; 350 goto fail;
@@ -363,8 +358,8 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab(
363fail: 358fail:
364 if (alloc) 359 if (alloc)
365 nvgpu_kmem_cache_free(a->alloc_cache, alloc); 360 nvgpu_kmem_cache_free(a->alloc_cache, alloc);
366 if (chunk) 361 if (sgl)
367 nvgpu_kmem_cache_free(a->chunk_cache, chunk); 362 nvgpu_kfree(a->owner->g, sgl);
368 return NULL; 363 return NULL;
369} 364}
370 365
@@ -426,7 +421,7 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages(
426 struct nvgpu_page_allocator *a, u64 pages) 421 struct nvgpu_page_allocator *a, u64 pages)
427{ 422{
428 struct nvgpu_page_alloc *alloc; 423 struct nvgpu_page_alloc *alloc;
429 struct page_alloc_chunk *c; 424 struct nvgpu_mem_sgl *sgl, *prev_sgl = NULL;
430 u64 max_chunk_len = pages << a->page_shift; 425 u64 max_chunk_len = pages << a->page_shift;
431 int i = 0; 426 int i = 0;
432 427
@@ -436,7 +431,6 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages(
436 431
437 memset(alloc, 0, sizeof(*alloc)); 432 memset(alloc, 0, sizeof(*alloc));
438 433
439 nvgpu_init_list_node(&alloc->alloc_chunks);
440 alloc->length = pages << a->page_shift; 434 alloc->length = pages << a->page_shift;
441 435
442 while (pages) { 436 while (pages) {
@@ -482,36 +476,48 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages(
482 goto fail_cleanup; 476 goto fail_cleanup;
483 } 477 }
484 478
485 c = nvgpu_kmem_cache_alloc(a->chunk_cache); 479 sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl));
486 if (!c) { 480 if (!sgl) {
487 nvgpu_free(&a->source_allocator, chunk_addr); 481 nvgpu_free(&a->source_allocator, chunk_addr);
488 goto fail_cleanup; 482 goto fail_cleanup;
489 } 483 }
490 484
491 pages -= chunk_pages; 485 pages -= chunk_pages;
492 486
493 c->base = chunk_addr; 487 sgl->phys = chunk_addr;
494 c->length = chunk_len; 488 sgl->dma = chunk_addr;
495 nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); 489 sgl->length = chunk_len;
490
491 /*
492 * Build the singly linked list with a head node that is part of
493 * the list.
494 */
495 if (prev_sgl)
496 prev_sgl->next = sgl;
497 else
498 alloc->sgl = sgl;
499
500 prev_sgl = sgl;
496 501
497 i++; 502 i++;
498 } 503 }
499 504
500 alloc->nr_chunks = i; 505 alloc->nr_chunks = i;
501 c = nvgpu_list_first_entry(&alloc->alloc_chunks, 506 alloc->base = alloc->sgl->phys;
502 page_alloc_chunk, list_entry);
503 alloc->base = c->base;
504 507
505 return alloc; 508 return alloc;
506 509
507fail_cleanup: 510fail_cleanup:
508 while (!nvgpu_list_empty(&alloc->alloc_chunks)) { 511 sgl = alloc->sgl;
509 c = nvgpu_list_first_entry(&alloc->alloc_chunks, 512 while (sgl) {
510 page_alloc_chunk, list_entry); 513 struct nvgpu_mem_sgl *next = sgl->next;
511 nvgpu_list_del(&c->list_entry); 514
512 nvgpu_free(&a->source_allocator, c->base); 515 nvgpu_free(&a->source_allocator, sgl->phys);
513 nvgpu_kmem_cache_free(a->chunk_cache, c); 516 nvgpu_kfree(a->owner->g, sgl);
517
518 sgl = next;
514 } 519 }
520
515 nvgpu_kmem_cache_free(a->alloc_cache, alloc); 521 nvgpu_kmem_cache_free(a->alloc_cache, alloc);
516fail: 522fail:
517 return NULL; 523 return NULL;
@@ -521,7 +527,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages(
521 struct nvgpu_page_allocator *a, u64 len) 527 struct nvgpu_page_allocator *a, u64 len)
522{ 528{
523 struct nvgpu_page_alloc *alloc = NULL; 529 struct nvgpu_page_alloc *alloc = NULL;
524 struct page_alloc_chunk *c; 530 struct nvgpu_mem_sgl *sgl;
525 u64 pages; 531 u64 pages;
526 int i = 0; 532 int i = 0;
527 533
@@ -536,11 +542,15 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages(
536 542
537 palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", 543 palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n",
538 pages << a->page_shift, pages, alloc->base); 544 pages << a->page_shift, pages, alloc->base);
539 nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, 545 sgl = alloc->sgl;
540 page_alloc_chunk, list_entry) { 546 while (sgl) {
541 palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", 547 palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n",
542 i++, c->base, c->length); 548 i++,
549 nvgpu_mem_sgl_phys(sgl),
550 nvgpu_mem_sgl_length(sgl));
551 sgl = sgl->next;
543 } 552 }
553 palloc_dbg(a, "Alloc done\n");
544 554
545 return alloc; 555 return alloc;
546} 556}
@@ -638,11 +648,11 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed(
638 struct nvgpu_page_allocator *a, u64 base, u64 length, u32 unused) 648 struct nvgpu_page_allocator *a, u64 base, u64 length, u32 unused)
639{ 649{
640 struct nvgpu_page_alloc *alloc; 650 struct nvgpu_page_alloc *alloc;
641 struct page_alloc_chunk *c; 651 struct nvgpu_mem_sgl *sgl;
642 652
643 alloc = nvgpu_kmem_cache_alloc(a->alloc_cache); 653 alloc = nvgpu_kmem_cache_alloc(a->alloc_cache);
644 c = nvgpu_kmem_cache_alloc(a->chunk_cache); 654 sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl));
645 if (!alloc || !c) 655 if (!alloc || !sgl)
646 goto fail; 656 goto fail;
647 657
648 alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length, 0); 658 alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length, 0);
@@ -653,17 +663,18 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed(
653 663
654 alloc->nr_chunks = 1; 664 alloc->nr_chunks = 1;
655 alloc->length = length; 665 alloc->length = length;
656 nvgpu_init_list_node(&alloc->alloc_chunks); 666 alloc->sgl = sgl;
657 667
658 c->base = alloc->base; 668 sgl->phys = alloc->base;
659 c->length = length; 669 sgl->dma = alloc->base;
660 nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); 670 sgl->length = length;
671 sgl->next = NULL;
661 672
662 return alloc; 673 return alloc;
663 674
664fail: 675fail:
665 if (c) 676 if (sgl)
666 nvgpu_kmem_cache_free(a->chunk_cache, c); 677 nvgpu_kfree(a->owner->g, sgl);
667 if (alloc) 678 if (alloc)
668 nvgpu_kmem_cache_free(a->alloc_cache, alloc); 679 nvgpu_kmem_cache_free(a->alloc_cache, alloc);
669 return NULL; 680 return NULL;
@@ -677,7 +688,7 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a,
677{ 688{
678 struct nvgpu_page_allocator *a = page_allocator(__a); 689 struct nvgpu_page_allocator *a = page_allocator(__a);
679 struct nvgpu_page_alloc *alloc = NULL; 690 struct nvgpu_page_alloc *alloc = NULL;
680 struct page_alloc_chunk *c; 691 struct nvgpu_mem_sgl *sgl;
681 u64 aligned_len, pages; 692 u64 aligned_len, pages;
682 int i = 0; 693 int i = 0;
683 694
@@ -697,10 +708,13 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a,
697 708
698 palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", 709 palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n",
699 alloc->base, aligned_len, pages); 710 alloc->base, aligned_len, pages);
700 nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, 711 sgl = alloc->sgl;
701 page_alloc_chunk, list_entry) { 712 while (sgl) {
702 palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", 713 palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n",
703 i++, c->base, c->length); 714 i++,
715 nvgpu_mem_sgl_phys(sgl),
716 nvgpu_mem_sgl_length(sgl));
717 sgl = sgl->next;
704 } 718 }
705 719
706 a->nr_fixed_allocs++; 720 a->nr_fixed_allocs++;
@@ -896,11 +910,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
896 910
897 a->alloc_cache = nvgpu_kmem_cache_create(g, 911 a->alloc_cache = nvgpu_kmem_cache_create(g,
898 sizeof(struct nvgpu_page_alloc)); 912 sizeof(struct nvgpu_page_alloc));
899 a->chunk_cache = nvgpu_kmem_cache_create(g,
900 sizeof(struct page_alloc_chunk));
901 a->slab_page_cache = nvgpu_kmem_cache_create(g, 913 a->slab_page_cache = nvgpu_kmem_cache_create(g,
902 sizeof(struct page_alloc_slab_page)); 914 sizeof(struct page_alloc_slab_page));
903 if (!a->alloc_cache || !a->chunk_cache || !a->slab_page_cache) { 915 if (!a->alloc_cache || !a->slab_page_cache) {
904 err = -ENOMEM; 916 err = -ENOMEM;
905 goto fail; 917 goto fail;
906 } 918 }
@@ -941,8 +953,6 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
941fail: 953fail:
942 if (a->alloc_cache) 954 if (a->alloc_cache)
943 nvgpu_kmem_cache_destroy(a->alloc_cache); 955 nvgpu_kmem_cache_destroy(a->alloc_cache);
944 if (a->chunk_cache)
945 nvgpu_kmem_cache_destroy(a->chunk_cache);
946 if (a->slab_page_cache) 956 if (a->slab_page_cache)
947 nvgpu_kmem_cache_destroy(a->slab_page_cache); 957 nvgpu_kmem_cache_destroy(a->slab_page_cache);
948 nvgpu_kfree(g, a); 958 nvgpu_kfree(g, a);
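The allocator now strings chunks together as it finds them, keeping a head pointer plus the previously appended node instead of a list_head. A sketch of that append pattern under the same assumptions (the helper is illustrative, not part of the patch; vidmem has no separate IOMMU address, hence dma == phys):

/* Illustrative helper: append one chunk to a singly linked SGL. */
static int sgl_append_chunk(struct gk20a *g, struct nvgpu_mem_sgl **head,
			    struct nvgpu_mem_sgl **prev,
			    u64 addr, u64 length)
{
	struct nvgpu_mem_sgl *sgl = nvgpu_kzalloc(g, sizeof(*sgl));

	if (!sgl)
		return -ENOMEM;

	sgl->phys = addr;
	sgl->dma = addr;	/* Vidmem: no separate IOMMU address. */
	sgl->length = length;

	/* First chunk becomes the head; later chunks hang off prev. */
	if (*prev)
		(*prev)->next = sgl;
	else
		*head = sgl;
	*prev = sgl;

	return 0;
}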
diff --git a/drivers/gpu/nvgpu/common/pramin.c b/drivers/gpu/nvgpu/common/pramin.c
index 425bfdb4..bb7d930e 100644
--- a/drivers/gpu/nvgpu/common/pramin.c
+++ b/drivers/gpu/nvgpu/common/pramin.c
@@ -84,37 +84,40 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem,
84 u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg) 84 u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg)
85{ 85{
86 struct nvgpu_page_alloc *alloc = NULL; 86 struct nvgpu_page_alloc *alloc = NULL;
87 struct page_alloc_chunk *chunk = NULL; 87 struct nvgpu_mem_sgl *sgl;
88 u32 byteoff, start_reg, until_end, n; 88 u32 byteoff, start_reg, until_end, n;
89 89
90 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); 90 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl);
91 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 91 sgl = alloc->sgl;
92 page_alloc_chunk, list_entry) { 92 while (sgl) {
93 if (offset >= chunk->length) 93 if (offset >= nvgpu_mem_sgl_length(sgl)) {
94 offset -= chunk->length; 94 offset -= nvgpu_mem_sgl_length(sgl);
95 else 95 sgl = sgl->next;
96 } else {
96 break; 97 break;
98 }
97 } 99 }
98 100
99 while (size) { 101 while (size) {
100 byteoff = g->ops.pramin.enter(g, mem, chunk, 102 u32 sgl_len = (u32)nvgpu_mem_sgl_length(sgl);
103
104 byteoff = g->ops.pramin.enter(g, mem, sgl,
101 offset / sizeof(u32)); 105 offset / sizeof(u32));
102 start_reg = g->ops.pramin.data032_r(byteoff / sizeof(u32)); 106 start_reg = g->ops.pramin.data032_r(byteoff / sizeof(u32));
103 until_end = SZ_1M - (byteoff & (SZ_1M - 1)); 107 until_end = SZ_1M - (byteoff & (SZ_1M - 1));
104 108
105 n = min3(size, until_end, (u32)(chunk->length - offset)); 109 n = min3(size, until_end, (u32)(sgl_len - offset));
106 110
107 loop(g, start_reg, n / sizeof(u32), arg); 111 loop(g, start_reg, n / sizeof(u32), arg);
108 112
109 /* read back to synchronize accesses */ 113 /* read back to synchronize accesses */
110 gk20a_readl(g, start_reg); 114 gk20a_readl(g, start_reg);
111 g->ops.pramin.exit(g, mem, chunk); 115 g->ops.pramin.exit(g, mem, sgl);
112 116
113 size -= n; 117 size -= n;
114 118
115 if (n == (chunk->length - offset)) { 119 if (n == (sgl_len - offset)) {
116 chunk = nvgpu_list_next_entry(chunk, page_alloc_chunk, 120 sgl = nvgpu_mem_sgl_next(sgl);
117 list_entry);
118 offset = 0; 121 offset = 0;
119 } else { 122 } else {
120 offset += n; 123 offset += n;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 7eee2d51..355228db 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -34,6 +34,7 @@ struct gk20a_debug_output;
34struct nvgpu_clk_pll_debug_data; 34struct nvgpu_clk_pll_debug_data;
35struct nvgpu_nvhost_dev; 35struct nvgpu_nvhost_dev;
36struct nvgpu_cpu_time_correlation_sample; 36struct nvgpu_cpu_time_correlation_sample;
37struct nvgpu_mem_sgl;
37 38
38#include <nvgpu/lock.h> 39#include <nvgpu/lock.h>
39#include <nvgpu/thread.h> 40#include <nvgpu/thread.h>
@@ -70,8 +71,6 @@ struct nvgpu_cpu_time_correlation_sample;
70#endif 71#endif
71#include "ecc_gk20a.h" 72#include "ecc_gk20a.h"
72 73
73struct page_alloc_chunk;
74
75/* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds. 74/* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds.
76 32 ns is the resolution of ptimer. */ 75 32 ns is the resolution of ptimer. */
77#define PTIMER_REF_FREQ_HZ 31250000 76#define PTIMER_REF_FREQ_HZ 31250000
@@ -701,7 +700,7 @@ struct gpu_ops {
701 bool (*support_sparse)(struct gk20a *g); 700 bool (*support_sparse)(struct gk20a *g);
702 u64 (*gmmu_map)(struct vm_gk20a *vm, 701 u64 (*gmmu_map)(struct vm_gk20a *vm,
703 u64 map_offset, 702 u64 map_offset,
704 struct sg_table *sgt, 703 struct nvgpu_mem_sgl *sgl,
705 u64 buffer_offset, 704 u64 buffer_offset,
706 u64 size, 705 u64 size,
707 int pgsz_idx, 706 int pgsz_idx,
@@ -761,9 +760,9 @@ struct gpu_ops {
761 size_t size); 760 size_t size);
762 struct { 761 struct {
763 u32 (*enter)(struct gk20a *g, struct nvgpu_mem *mem, 762 u32 (*enter)(struct gk20a *g, struct nvgpu_mem *mem,
764 struct page_alloc_chunk *chunk, u32 w); 763 struct nvgpu_mem_sgl *sgl, u32 w);
765 void (*exit)(struct gk20a *g, struct nvgpu_mem *mem, 764 void (*exit)(struct gk20a *g, struct nvgpu_mem *mem,
766 struct page_alloc_chunk *chunk); 765 struct nvgpu_mem_sgl *sgl);
767 u32 (*data032_r)(u32 i); 766 u32 (*data032_r)(u32 i);
768 } pramin; 767 } pramin;
769 struct { 768 struct {
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 97b7aa80..cd34e769 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1151,7 +1151,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem)
1151 struct gk20a_fence *gk20a_fence_out = NULL; 1151 struct gk20a_fence *gk20a_fence_out = NULL;
1152 struct gk20a_fence *gk20a_last_fence = NULL; 1152 struct gk20a_fence *gk20a_last_fence = NULL;
1153 struct nvgpu_page_alloc *alloc = NULL; 1153 struct nvgpu_page_alloc *alloc = NULL;
1154 struct page_alloc_chunk *chunk = NULL; 1154 struct nvgpu_mem_sgl *sgl = NULL;
1155 int err = 0; 1155 int err = 0;
1156 1156
1157 if (g->mm.vidmem.ce_ctx_id == (u32)~0) 1157 if (g->mm.vidmem.ce_ctx_id == (u32)~0)
@@ -1159,16 +1159,16 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem)
1159 1159
1160 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); 1160 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl);
1161 1161
1162 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 1162 sgl = alloc->sgl;
1163 page_alloc_chunk, list_entry) { 1163 while (sgl) {
1164 if (gk20a_last_fence) 1164 if (gk20a_last_fence)
1165 gk20a_fence_put(gk20a_last_fence); 1165 gk20a_fence_put(gk20a_last_fence);
1166 1166
1167 err = gk20a_ce_execute_ops(g, 1167 err = gk20a_ce_execute_ops(g,
1168 g->mm.vidmem.ce_ctx_id, 1168 g->mm.vidmem.ce_ctx_id,
1169 0, 1169 0,
1170 chunk->base, 1170 nvgpu_mem_sgl_phys(sgl),
1171 chunk->length, 1171 nvgpu_mem_sgl_length(sgl),
1172 0x00000000, 1172 0x00000000,
1173 NVGPU_CE_DST_LOCATION_LOCAL_FB, 1173 NVGPU_CE_DST_LOCATION_LOCAL_FB,
1174 NVGPU_CE_MEMSET, 1174 NVGPU_CE_MEMSET,
@@ -1183,6 +1183,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem)
1183 } 1183 }
1184 1184
1185 gk20a_last_fence = gk20a_fence_out; 1185 gk20a_last_fence = gk20a_fence_out;
1186 sgl = nvgpu_mem_sgl_next(sgl);
1186 } 1187 }
1187 1188
1188 if (gk20a_last_fence) { 1189 if (gk20a_last_fence) {
@@ -1262,10 +1263,10 @@ dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr)
1262 return addr; 1263 return addr;
1263} 1264}
1264 1265
1265u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova) 1266u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, u64 iova)
1266{ 1267{
1267 /* ensure it is not vidmem allocation */ 1268 /* ensure it is not vidmem allocation */
1268 WARN_ON(is_vidmem_page_alloc((u64)iova)); 1269 WARN_ON(is_vidmem_page_alloc(iova));
1269 1270
1270 if (device_is_iommuable(dev_from_gk20a(g)) && 1271 if (device_is_iommuable(dev_from_gk20a(g)) &&
1271 g->ops.mm.get_physical_addr_bits) 1272 g->ops.mm.get_physical_addr_bits)
@@ -2167,11 +2168,6 @@ u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g)
2167 return 34; 2168 return 34;
2168} 2169}
2169 2170
2170u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags)
2171{
2172 return phys;
2173}
2174
2175const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, 2171const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
2176 u32 big_page_size) 2172 u32 big_page_size)
2177{ 2173{
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index c77bebf8..2fdc1729 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -336,7 +336,6 @@ void gk20a_mm_dump_vm(struct vm_gk20a *vm,
336 336
337int gk20a_mm_suspend(struct gk20a *g); 337int gk20a_mm_suspend(struct gk20a *g);
338 338
339u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags);
340u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); 339u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova);
341 340
342void gk20a_mm_ltc_isr(struct gk20a *g); 341void gk20a_mm_ltc_isr(struct gk20a *g);
@@ -361,29 +360,29 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem)
361} 360}
362 361
363u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 362u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
364 u64 map_offset, 363 u64 map_offset,
365 struct sg_table *sgt, 364 struct nvgpu_mem_sgl *sgl,
366 u64 buffer_offset, 365 u64 buffer_offset,
367 u64 size, 366 u64 size,
368 int pgsz_idx, 367 int pgsz_idx,
369 u8 kind_v, 368 u8 kind_v,
370 u32 ctag_offset, 369 u32 ctag_offset,
371 u32 flags, 370 u32 flags,
372 int rw_flag, 371 int rw_flag,
373 bool clear_ctags, 372 bool clear_ctags,
374 bool sparse, 373 bool sparse,
375 bool priv, 374 bool priv,
376 struct vm_gk20a_mapping_batch *batch, 375 struct vm_gk20a_mapping_batch *batch,
377 enum nvgpu_aperture aperture); 376 enum nvgpu_aperture aperture);
378 377
379void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, 378void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
380 u64 vaddr, 379 u64 vaddr,
381 u64 size, 380 u64 size,
382 int pgsz_idx, 381 int pgsz_idx,
383 bool va_allocated, 382 bool va_allocated,
384 int rw_flag, 383 int rw_flag,
385 bool sparse, 384 bool sparse,
386 struct vm_gk20a_mapping_batch *batch); 385 struct vm_gk20a_mapping_batch *batch);
387 386
388struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf); 387struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
389void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf, 388void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c
index 9d19e9e5..8a34a63c 100644
--- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c
@@ -26,9 +26,9 @@
26 26
27/* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */ 27/* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */
28u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, 28u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem,
29 struct page_alloc_chunk *chunk, u32 w) 29 struct nvgpu_mem_sgl *sgl, u32 w)
30{ 30{
31 u64 bufbase = chunk->base; 31 u64 bufbase = nvgpu_mem_sgl_phys(sgl);
32 u64 addr = bufbase + w * sizeof(u32); 32 u64 addr = bufbase + w * sizeof(u32);
33 u32 hi = (u32)((addr & ~(u64)0xfffff) 33 u32 hi = (u32)((addr & ~(u64)0xfffff)
34 >> bus_bar0_window_target_bar0_window_base_shift_v()); 34 >> bus_bar0_window_target_bar0_window_base_shift_v());
@@ -40,8 +40,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem,
40 40
41 gk20a_dbg(gpu_dbg_mem, 41 gk20a_dbg(gpu_dbg_mem,
42 "0x%08x:%08x begin for %p,%p at [%llx,%llx] (sz %llx)", 42 "0x%08x:%08x begin for %p,%p at [%llx,%llx] (sz %llx)",
43 hi, lo, mem, chunk, bufbase, 43 hi, lo, mem, sgl, bufbase,
44 bufbase + chunk->length, chunk->length); 44 bufbase + nvgpu_mem_sgl_phys(sgl),
45 nvgpu_mem_sgl_length(sgl));
45 46
46 WARN_ON(!bufbase); 47 WARN_ON(!bufbase);
47 48
@@ -57,9 +58,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem,
57} 58}
58 59
59void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, 60void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem,
60 struct page_alloc_chunk *chunk) 61 struct nvgpu_mem_sgl *sgl)
61{ 62{
62 gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, chunk); 63 gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, sgl);
63 64
64 nvgpu_spinlock_release(&g->mm.pramin_window_lock); 65 nvgpu_spinlock_release(&g->mm.pramin_window_lock);
65} 66}
diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h
index 1a1ac871..fc5ba919 100644
--- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h
@@ -19,10 +19,10 @@
19 19
20struct gk20a; 20struct gk20a;
21struct nvgpu_mem; 21struct nvgpu_mem;
22struct page_alloc_chunk; 22struct nvgpu_mem_sgl;
23 23
24u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, 24u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem,
25 struct page_alloc_chunk *chunk, u32 w); 25 struct nvgpu_mem_sgl *sgl, u32 w);
26void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, 26void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem,
27 struct page_alloc_chunk *chunk); 27 struct nvgpu_mem_sgl *sgl);
28#endif 28#endif
diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
index fc27b120..c276f5a6 100644
--- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
@@ -904,7 +904,7 @@ int gr_gp10b_alloc_buffer(struct vm_gk20a *vm, size_t size,
904 904
905 mem->gpu_va = nvgpu_gmmu_map(vm, 905 mem->gpu_va = nvgpu_gmmu_map(vm,
906 mem, 906 mem,
907 size, 907 mem->aligned_size,
908 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 908 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
909 gk20a_mem_flag_none, 909 gk20a_mem_flag_none,
910 false, 910 false,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index de129a5f..11060300 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -27,8 +27,6 @@
27#include <nvgpu/gmmu_t19x.h> 27#include <nvgpu/gmmu_t19x.h>
28#endif 28#endif
29 29
30struct scatterlist;
31
32/* 30/*
33 * This is the GMMU API visible to blocks outside of the GMMU. Basically this 31 * This is the GMMU API visible to blocks outside of the GMMU. Basically this
34 * API supports all the different types of mappings that might be done in the 32 * API supports all the different types of mappings that might be done in the
diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h
index e2d4d336..f96c2801 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h
@@ -32,6 +32,8 @@ struct nvgpu_mem_priv {
32}; 32};
33 33
34u64 nvgpu_mem_get_addr_sgl(struct gk20a *g, struct scatterlist *sgl); 34u64 nvgpu_mem_get_addr_sgl(struct gk20a *g, struct scatterlist *sgl);
35struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g,
36 struct sg_table *sgt);
35 37
36/** 38/**
37 * __nvgpu_mem_create_from_pages - Create an nvgpu_mem from physical pages. 39 * __nvgpu_mem_create_from_pages - Create an nvgpu_mem from physical pages.
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 4cac3e70..cfce8c5b 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -71,6 +71,7 @@ enum nvgpu_log_categories {
71 gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */ 71 gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */
72 gpu_dbg_alloc = BIT(21), /* Allocator debugging. */ 72 gpu_dbg_alloc = BIT(21), /* Allocator debugging. */
73 gpu_dbg_dma = BIT(22), /* DMA allocation prints. */ 73 gpu_dbg_dma = BIT(22), /* DMA allocation prints. */
74 gpu_dbg_sgl = BIT(23), /* SGL related traces. */
74 gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ 75 gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */
75}; 76};
76 77
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
index a112623e..7d19cf81 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
@@ -33,6 +33,8 @@ struct gk20a;
33struct nvgpu_allocator; 33struct nvgpu_allocator;
34struct nvgpu_gmmu_attrs; 34struct nvgpu_gmmu_attrs;
35 35
36#define NVGPU_MEM_DMA_ERROR (~0ULL)
37
36/* 38/*
37 * Real location of a buffer - nvgpu_aperture_mask() will deduce what will be 39 * Real location of a buffer - nvgpu_aperture_mask() will deduce what will be
38 * told to the gpu about the aperture, but this flag designates where the 40 * told to the gpu about the aperture, but this flag designates where the
@@ -44,6 +46,28 @@ enum nvgpu_aperture {
44 APERTURE_VIDMEM 46 APERTURE_VIDMEM
45}; 47};
46 48
49/*
50 * This struct holds the necessary information for describing a struct
51 * nvgpu_mem's scatter gather list.
52 *
53 * These are created in a platform dependent way. As a result the function
54 * definition for allocating these lives in the <nvgpu/_OS_/nvgpu_mem.h> file.
55 */
56struct nvgpu_mem_sgl {
57 /*
58 * Internally this is implemented as a singly linked list.
59 */
60 struct nvgpu_mem_sgl *next;
61
62 /*
63 * There is both a phys address and a DMA address since some systems,
64 * for example ones with an IOMMU, may see these as different addresses.
65 */
66 u64 phys;
67 u64 dma;
68 u64 length;
69};
70
47struct nvgpu_mem { 71struct nvgpu_mem {
48 /* 72 /*
49 * Populated for all nvgpu_mem structs - vidmem or system. 73 * Populated for all nvgpu_mem structs - vidmem or system.
@@ -176,6 +200,27 @@ int nvgpu_mem_create_from_mem(struct gk20a *g,
176 struct nvgpu_mem *dest, struct nvgpu_mem *src, 200 struct nvgpu_mem *dest, struct nvgpu_mem *src,
177 int start_page, int nr_pages); 201 int start_page, int nr_pages);
178 202
203/**
204 * nvgpu_mem_sgl_create_from_mem - Create a scatter list from an nvgpu_mem.
205 *
206 * @g - The GPU.
207 * @mem - The source memory allocation to use.
208 *
209 * Create a scatter gather list from the passed @mem struct. This list lets the
210 * calling code iterate across each chunk of a DMA allocation for when that DMA
211 * allocation is not completely contiguous.
212 */
213struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g,
214 struct nvgpu_mem *mem);
215void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl);
216
217struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl);
218u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl);
219u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl);
220u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl);
221u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl,
222 struct nvgpu_gmmu_attrs *attrs);
223
179/* 224/*
180 * Buffer accessors - wrap between begin() and end() if there is no permanent 225 * Buffer accessors - wrap between begin() and end() if there is no permanent
181 * kernel mapping for this buffer. 226 * kernel mapping for this buffer.
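Code outside the Linux backend is expected to go through the accessors declared above rather than touch the struct fields directly, so it stays independent of how the list was built. As an illustration (this helper is hypothetical, not part of the header), summing the bytes a list covers needs nothing but the accessor API:

/* Illustrative only: total number of bytes described by an SGL. */
static u64 sgl_total_length(struct nvgpu_mem_sgl *sgl)
{
	u64 bytes = 0;

	while (sgl) {
		bytes += nvgpu_mem_sgl_length(sgl);
		sgl = nvgpu_mem_sgl_next(sgl);
	}

	return bytes;
}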
diff --git a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h
index 9a5ef8d3..de83ca7f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h
@@ -18,6 +18,7 @@
18#define PAGE_ALLOCATOR_PRIV_H 18#define PAGE_ALLOCATOR_PRIV_H
19 19
20#include <nvgpu/allocator.h> 20#include <nvgpu/allocator.h>
21#include <nvgpu/nvgpu_mem.h>
21#include <nvgpu/kmem.h> 22#include <nvgpu/kmem.h>
22#include <nvgpu/list.h> 23#include <nvgpu/list.h>
23#include <nvgpu/rbtree.h> 24#include <nvgpu/rbtree.h>
@@ -83,27 +84,17 @@ page_alloc_slab_page_from_list_entry(struct nvgpu_list_node *node)
83 ((uintptr_t)node - offsetof(struct page_alloc_slab_page, list_entry)); 84 ((uintptr_t)node - offsetof(struct page_alloc_slab_page, list_entry));
84}; 85};
85 86
86struct page_alloc_chunk {
87 struct nvgpu_list_node list_entry;
88
89 u64 base;
90 u64 length;
91};
92
93static inline struct page_alloc_chunk *
94page_alloc_chunk_from_list_entry(struct nvgpu_list_node *node)
95{
96 return (struct page_alloc_chunk *)
97 ((uintptr_t)node - offsetof(struct page_alloc_chunk, list_entry));
98};
99
100/* 87/*
101 * Struct to handle internal management of page allocation. It holds a list 88 * Struct to handle internal management of page allocation. It holds a list
102 * of the chunks of pages that make up the overall allocation - much like a 89 * of the chunks of pages that make up the overall allocation - much like a
103 * scatter gather table. 90 * scatter gather table.
104 */ 91 */
105struct nvgpu_page_alloc { 92struct nvgpu_page_alloc {
106 struct nvgpu_list_node alloc_chunks; 93 /*
94 * nvgpu_mem_sgl for describing the actual allocation. Convenient for
95 * GMMU mapping.
96 */
97 struct nvgpu_mem_sgl *sgl;
107 98
108 int nr_chunks; 99 int nr_chunks;
109 u64 length; 100 u64 length;
@@ -156,7 +147,6 @@ struct nvgpu_page_allocator {
156 int nr_slabs; 147 int nr_slabs;
157 148
158 struct nvgpu_kmem_cache *alloc_cache; 149 struct nvgpu_kmem_cache *alloc_cache;
159 struct nvgpu_kmem_cache *chunk_cache;
160 struct nvgpu_kmem_cache *slab_page_cache; 150 struct nvgpu_kmem_cache *slab_page_cache;
161 151
162 u64 flags; 152 u64 flags;
diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c
index 85c436e5..ee9b791a 100644
--- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c
+++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c
@@ -13,7 +13,6 @@
13 * more details. 13 * more details.
14 */ 14 */
15 15
16#include <linux/dma-mapping.h>
17#include "vgpu/vgpu.h" 16#include "vgpu/vgpu.h"
18#include "vgpu_mm_gp10b.h" 17#include "vgpu_mm_gp10b.h"
19#include "gk20a/mm_gk20a.h" 18#include "gk20a/mm_gk20a.h"
@@ -41,7 +40,7 @@ static inline int add_mem_desc(struct tegra_vgpu_mem_desc *mem_desc,
41 40
42static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, 41static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm,
43 u64 map_offset, 42 u64 map_offset,
44 struct sg_table *sgt, 43 struct nvgpu_mem_sgl *sgl,
45 u64 buffer_offset, 44 u64 buffer_offset,
46 u64 size, 45 u64 size,
47 int pgsz_idx, 46 int pgsz_idx,
@@ -61,10 +60,9 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm,
61 struct tegra_vgpu_as_map_ex_params *p = &msg.params.as_map_ex; 60 struct tegra_vgpu_as_map_ex_params *p = &msg.params.as_map_ex;
62 struct tegra_vgpu_mem_desc *mem_desc; 61 struct tegra_vgpu_mem_desc *mem_desc;
63 u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; 62 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
63 u64 buffer_size = PAGE_ALIGN(size);
64 u64 space_to_skip = buffer_offset; 64 u64 space_to_skip = buffer_offset;
65 u64 buffer_size = 0;
66 u32 mem_desc_count = 0, i; 65 u32 mem_desc_count = 0, i;
67 struct scatterlist *sgl;
68 void *handle = NULL; 66 void *handle = NULL;
69 size_t oob_size; 67 size_t oob_size;
70 u8 prot; 68 u8 prot;
@@ -73,7 +71,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm,
73 71
74 /* FIXME: add support for sparse mappings */ 72 /* FIXME: add support for sparse mappings */
75 73
76 if (WARN_ON(!sgt) || WARN_ON(!g->mm.bypass_smmu)) 74 if (WARN_ON(!sgl) || WARN_ON(!g->mm.bypass_smmu))
77 return 0; 75 return 0;
78 76
79 if (space_to_skip & (page_size - 1)) 77 if (space_to_skip & (page_size - 1))
@@ -100,33 +98,36 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm,
100 goto fail; 98 goto fail;
101 } 99 }
102 100
103 sgl = sgt->sgl; 101 while (sgl) {
104 while (space_to_skip && sgl && 102 u64 phys_addr;
105 (space_to_skip + page_size > sgl->length)) { 103 u64 chunk_length;
106 space_to_skip -= sgl->length; 104
107 sgl = sg_next(sgl); 105 /*
108 } 106 * Cut out sgl ents for space_to_skip.
109 WARN_ON(!sgl); 107 */
108 if (space_to_skip &&
109 space_to_skip >= nvgpu_mem_sgl_length(sgl)) {
110 space_to_skip -= nvgpu_mem_sgl_length(sgl);
111 sgl = nvgpu_mem_sgl_next(sgl);
112 continue;
113 }
110 114
111 if (add_mem_desc(&mem_desc[mem_desc_count++], 115 phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip;
112 sg_phys(sgl) + space_to_skip, 116 chunk_length = min(size,
113 sgl->length - space_to_skip, 117 nvgpu_mem_sgl_length(sgl) - space_to_skip);
114 &oob_size)) {
115 err = -ENOMEM;
116 goto fail;
117 }
118 buffer_size += sgl->length - space_to_skip;
119 118
120 sgl = sg_next(sgl); 119 if (add_mem_desc(&mem_desc[mem_desc_count++], phys_addr,
121 while (sgl && buffer_size < size) { 120 chunk_length, &oob_size)) {
122 if (add_mem_desc(&mem_desc[mem_desc_count++], sg_phys(sgl),
123 sgl->length, &oob_size)) {
124 err = -ENOMEM; 121 err = -ENOMEM;
125 goto fail; 122 goto fail;
126 } 123 }
127 124
128 buffer_size += sgl->length; 125 space_to_skip = 0;
129 sgl = sg_next(sgl); 126 size -= chunk_length;
127 sgl = nvgpu_mem_sgl_next(sgl);
128
129 if (size == 0)
130 break;
130 } 131 }
131 132
132 if (rw_flag == gk20a_mem_flag_read_only) 133 if (rw_flag == gk20a_mem_flag_read_only)
@@ -153,7 +154,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm,
153 msg.handle = vgpu_get_handle(g); 154 msg.handle = vgpu_get_handle(g);
154 p->handle = vm->handle; 155 p->handle = vm->handle;
155 p->gpu_va = map_offset; 156 p->gpu_va = map_offset;
156 p->size = size; 157 p->size = buffer_size;
157 p->mem_desc_count = mem_desc_count; 158 p->mem_desc_count = mem_desc_count;
158 p->pgsz_idx = pgsz_idx; 159 p->pgsz_idx = pgsz_idx;
159 p->iova = 0; 160 p->iova = 0;
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index ef9e00c8..5da6f158 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -78,7 +78,7 @@ int vgpu_init_mm_support(struct gk20a *g)
78 78
79static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, 79static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
80 u64 map_offset, 80 u64 map_offset,
81 struct sg_table *sgt, 81 struct nvgpu_mem_sgl *sgl,
82 u64 buffer_offset, 82 u64 buffer_offset,
83 u64 size, 83 u64 size,
84 int pgsz_idx, 84 int pgsz_idx,
@@ -98,7 +98,7 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
98 struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d); 98 struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
99 struct tegra_vgpu_cmd_msg msg; 99 struct tegra_vgpu_cmd_msg msg;
100 struct tegra_vgpu_as_map_params *p = &msg.params.as_map; 100 struct tegra_vgpu_as_map_params *p = &msg.params.as_map;
101 u64 addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); 101 u64 addr = nvgpu_mem_sgl_gpu_addr(g, sgl, NULL);
102 u8 prot; 102 u8 prot;
103 103
104 gk20a_dbg_fn(""); 104 gk20a_dbg_fn("");