From 0090ee5aca268a3c359f34c74b8c521df3bd8593 Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Thu, 25 May 2017 16:56:50 -0700
Subject: gpu: nvgpu: nvgpu SGL implementation

The last major item preventing the core MM code in the nvgpu driver
from being platform agnostic is the usage of Linux scatter-gather
tables and scatter-gather lists. These data structures are used
throughout the mapping code to handle discontiguous DMA allocations
and are also overloaded to represent VIDMEM allocs.

The notion of a scatter-gather table is crucial to a HW device that
can handle discontiguous DMA. The GPU has an MMU which allows the GPU
to do page gathering and present a virtually contiguous buffer to the
GPU HW. As a result it makes sense for the GPU driver to use some sort
of scatter-gather concept to maximize memory usage efficiency.

To that end this patch keeps the notion of a scatter-gather list but
implements it in the nvgpu common code. It is based heavily on the
Linux SGL concept. It is a singly linked list of blocks, each
representing a chunk of memory. To map or use a DMA allocation, SW
must iterate over each block in the SGL.

This patch implements the most basic level of support for this data
structure. There are certainly easy optimizations that could be done
to speed up the current implementation. However, this patch's goal is
simply to divest the core MM code of any last Linuxisms. Speed and
efficiency come next.

Change-Id: Icf44641db22d87fa1d003debbd9f71b605258e42
Signed-off-by: Alex Waterman
Reviewed-on: https://git-master.nvidia.com/r/1530867
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/Makefile.nvgpu                  |   1 +
 drivers/gpu/nvgpu/common/linux/nvgpu_mem.c        | 114 +++++++++++++++++
 drivers/gpu/nvgpu/common/linux/vm.c               |  25 ++--
 drivers/gpu/nvgpu/common/mm/gmmu.c                | 109 ++++++++---------
 drivers/gpu/nvgpu/common/mm/nvgpu_mem.c           |  73 +++++++++++
 drivers/gpu/nvgpu/common/mm/page_allocator.c      | 142 ++++++++++++----------
 drivers/gpu/nvgpu/common/pramin.c                 |  27 ++--
 drivers/gpu/nvgpu/gk20a/gk20a.h                   |   9 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c                |  20 ++-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h                |  43 ++++---
 drivers/gpu/nvgpu/gk20a/pramin_gk20a.c            |  13 +-
 drivers/gpu/nvgpu/gk20a/pramin_gk20a.h            |   6 +-
 drivers/gpu/nvgpu/gp10b/gr_gp10b.c                |   2 +-
 drivers/gpu/nvgpu/include/nvgpu/gmmu.h            |   2 -
 drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h |   2 +
 drivers/gpu/nvgpu/include/nvgpu/log.h             |   1 +
 drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h       |  45 +++++++
 drivers/gpu/nvgpu/include/nvgpu/page_allocator.h  |  22 +---
 drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c      |  55 +++++----
 drivers/gpu/nvgpu/vgpu/mm_vgpu.c                  |   4 +-
 20 files changed, 474 insertions(+), 241 deletions(-)
 create mode 100644 drivers/gpu/nvgpu/common/mm/nvgpu_mem.c

diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index d02870fb..6e475fcb 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -55,6 +55,7 @@ nvgpu-y := \
	common/mm/pd_cache.o \
	common/mm/vm.o \
	common/mm/vm_area.o \
+	common/mm/nvgpu_mem.o \
	common/bus.o \
	common/enabled.o \
	common/pramin.o \
diff --git a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
index e4991d0d..eb54f3fd 100644
--- a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
+++ b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -395,3 +396,116 @@ int __nvgpu_mem_create_from_pages(struct gk20a *g, struct nvgpu_mem *dest,

	return 0;
 }
+
+static struct 
nvgpu_mem_sgl *__nvgpu_mem_sgl_dup(struct gk20a *g, + struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *head, *next; + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + next = head; + while (true) { + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + next->dma = sgl->dma; + next->phys = sgl->phys; + next->length = sgl->length; + next->next = NULL; + + sgl = nvgpu_mem_sgl_next(sgl); + if (!sgl) + break; + + next->next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next->next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + next = next->next; + } + + return head; +} + +static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_create_from_vidmem( + struct gk20a *g, + struct scatterlist *linux_sgl) +{ + struct nvgpu_page_alloc *vidmem_alloc; + + vidmem_alloc = get_vidmem_page_alloc(linux_sgl); + if (!vidmem_alloc) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Vidmem sgl:"); + + return __nvgpu_mem_sgl_dup(g, vidmem_alloc->sgl); +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g, + struct sg_table *sgt) +{ + struct nvgpu_mem_sgl *head, *sgl, *next; + struct scatterlist *linux_sgl = sgt->sgl; + + if (is_vidmem_page_alloc(sg_dma_address(linux_sgl))) + return __nvgpu_mem_sgl_create_from_vidmem(g, linux_sgl); + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Making sgl:"); + + sgl = head; + while (true) { + sgl->dma = sg_dma_address(linux_sgl); + sgl->phys = sg_phys(linux_sgl); + sgl->length = linux_sgl->length; + + /* + * We don't like offsets in the pages here. This will cause + * problems. + */ + if (WARN_ON(linux_sgl->offset)) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + /* + * When there's no more SGL ents for the Linux SGL we are + * done. Don't bother making any more SGL ents for the nvgpu + * SGL. 
+ */ + linux_sgl = sg_next(linux_sgl); + if (!linux_sgl) + break; + + next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + sgl->next = next; + sgl = next; + } + + nvgpu_log(g, gpu_dbg_sgl, "Done!"); + return head; +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g, + struct nvgpu_mem *mem) +{ + return nvgpu_mem_sgl_create(g, mem->priv.sgt); +} diff --git a/drivers/gpu/nvgpu/common/linux/vm.c b/drivers/gpu/nvgpu/common/linux/vm.c index 86d8bec9..4a4429dc 100644 --- a/drivers/gpu/nvgpu/common/linux/vm.c +++ b/drivers/gpu/nvgpu/common/linux/vm.c @@ -21,8 +21,11 @@ #include #include #include +#include #include +#include + #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" #include "gk20a/kind_gk20a.h" @@ -66,17 +69,19 @@ static u64 nvgpu_get_buffer_alignment(struct gk20a *g, struct scatterlist *sgl, if (aperture == APERTURE_VIDMEM) { struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl); - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl_vid = alloc->sgl; - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - chunk_align = 1ULL << __ffs(chunk->base | - chunk->length); + while (sgl_vid) { + chunk_align = 1ULL << + __ffs(nvgpu_mem_sgl_phys(sgl_vid) | + nvgpu_mem_sgl_length(sgl_vid)); if (align) align = min(align, chunk_align); else align = chunk_align; + + sgl_vid = nvgpu_mem_sgl_next(sgl_vid); } return align; @@ -237,6 +242,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, struct nvgpu_vm_area *vm_area = NULL; u32 ctag_offset; enum nvgpu_aperture aperture; + struct nvgpu_mem_sgl *nvgpu_sgl; /* * The kind used as part of the key for map caching. HW may @@ -393,9 +399,12 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, ctag_offset += buffer_offset >> ilog2(g->ops.fb.compression_page_size(g)); + nvgpu_sgl = nvgpu_mem_sgl_create(g, bfr.sgt); + /* update gmmu ptes */ - map_offset = g->ops.mm.gmmu_map(vm, map_offset, - bfr.sgt, + map_offset = g->ops.mm.gmmu_map(vm, + map_offset, + nvgpu_sgl, buffer_offset, /* sg offset */ mapping_size, bfr.pgsz_idx, @@ -410,6 +419,8 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, if (!map_offset) goto clean_up; + nvgpu_mem_sgl_free(g, nvgpu_sgl); + mapped_buffer = nvgpu_kzalloc(g, sizeof(*mapped_buffer)); if (!mapped_buffer) { nvgpu_warn(g, "oom allocating tracking buffer"); diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index 7f486d68..41f5acdd 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -65,11 +65,14 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, struct gk20a *g = gk20a_from_vm(vm); u64 vaddr; - struct sg_table *sgt = mem->priv.sgt; + struct nvgpu_mem_sgl *sgl = nvgpu_mem_sgl_create_from_mem(g, mem); + + if (!sgl) + return -ENOMEM; nvgpu_mutex_acquire(&vm->update_gmmu_lock); vaddr = g->ops.mm.gmmu_map(vm, addr, - sgt, /* sg table */ + sgl, /* sg list */ 0, /* sg offset */ size, gmmu_page_size_kernel, @@ -82,8 +85,11 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, NULL, /* mapping_batch handle */ aperture); nvgpu_mutex_release(&vm->update_gmmu_lock); + + nvgpu_mem_sgl_free(g, sgl); + if (!vaddr) { - nvgpu_err(g, "failed to allocate va space"); + nvgpu_err(g, "failed to map buffer!"); return 0; } @@ -91,7 +97,7 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. + * Map a nvgpu_mem into the GMMU. This is for kernel space to use. 
*/ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -106,7 +112,7 @@ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Like nvgpu_gmmu_map() except it can work on a fixed address instead. + * Like nvgpu_gmmu_map() except this can work on a fixed address. */ u64 nvgpu_gmmu_map_fixed(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -407,7 +413,7 @@ static int __set_pd_level(struct vm_gk20a *vm, */ target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : - g->ops.mm.gpu_phys_addr(g, attrs, phys_addr); + phys_addr; l->update_entry(vm, l, pd, pd_idx, @@ -458,18 +464,16 @@ static int __set_pd_level(struct vm_gk20a *vm, * VIDMEM version of the update_ptes logic. */ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { - struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; u64 phys_addr, chunk_length; int err = 0; - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -482,22 +486,21 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, return err; } - alloc = get_vidmem_page_alloc(sgt->sgl); - /* * Otherwise iterate across all the chunks in this allocation and * map them. */ - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + while (sgl) { if (space_to_skip && - space_to_skip >= chunk->length) { - space_to_skip -= chunk->length; + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = chunk->base + space_to_skip; - chunk_length = min(length, (chunk->length - space_to_skip)); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, (nvgpu_mem_sgl_length(sgl) - + space_to_skip)); err = __set_pd_level(vm, &vm->pdb, 0, @@ -518,23 +521,24 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, if (length == 0) break; + + sgl = nvgpu_mem_sgl_next(sgl); } return err; } static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { int err; - struct scatterlist *sgl; struct gk20a *g = gk20a_from_vm(vm); - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -548,19 +552,15 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, } /* - * At this point we have a Linux scatter-gather list pointing to some - * number of discontiguous chunks of memory. Iterate over that list and + * At this point we have a scatter-gather list pointing to some number + * of discontiguous chunks of memory. We must iterate over that list and * generate a GMMU map call for each chunk. There are two possibilities: - * either the IOMMU is enabled or not. When the IOMMU is enabled the + * either an IOMMU is enabled or not. When an IOMMU is enabled the * mapping is simple since the "physical" address is actually a virtual - * IO address and will be contiguous. The no-IOMMU case is more - * complicated. We will have to iterate over the SGT and do a separate - * map for each chunk of the SGT. + * IO address and will be contiguous. 
*/ - sgl = sgt->sgl; - if (!g->mm.bypass_smmu) { - u64 io_addr = nvgpu_mem_get_addr_sgl(g, sgl); + u64 io_addr = nvgpu_mem_sgl_gpu_addr(g, sgl, attrs); io_addr += space_to_skip; @@ -585,14 +585,16 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, /* * Cut out sgl ents for space_to_skip. */ - if (space_to_skip && space_to_skip >= sgl->length) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); + if (space_to_skip && + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = sg_phys(sgl) + space_to_skip; - chunk_length = min(length, sgl->length - space_to_skip); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, + nvgpu_mem_sgl_length(sgl) - space_to_skip); err = __set_pd_level(vm, &vm->pdb, 0, @@ -600,13 +602,11 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, virt_addr, chunk_length, attrs); - if (err) - return err; space_to_skip = 0; virt_addr += chunk_length; length -= chunk_length; - sgl = sg_next(sgl); + sgl = nvgpu_mem_sgl_next(sgl); if (length == 0) break; @@ -624,22 +624,20 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, * implementations. But the logic around that is generic to all chips. Every * chip has some number of PDE levels and then a PTE level. * - * Each chunk of the incoming SGT is sent to the chip specific implementation + * Each chunk of the incoming SGL is sent to the chip specific implementation * of page table update. * * [*] Note: the "physical" address may actually be an IO virtual address in the * case of SMMU usage. */ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); - struct nvgpu_page_alloc *alloc; - u64 phys_addr = 0; u32 page_size; int err; @@ -665,25 +663,16 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, return err; } - if (sgt) { - if (attrs->aperture == APERTURE_VIDMEM) { - alloc = get_vidmem_page_alloc(sgt->sgl); - - phys_addr = alloc->base; - } else - phys_addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); - } - __gmmu_dbg(g, attrs, "vm=%s " "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " "kind=%#02x APT=%-6s %c%c%c%c%c", vm->name, - sgt ? "MAP" : "UNMAP", + sgl ? "MAP" : "UNMAP", virt_addr, length, - phys_addr, + sgl ? nvgpu_mem_sgl_phys(sgl) : 0, space_to_skip, page_size >> 10, nvgpu_gmmu_perm_str(attrs->rw_flag), @@ -696,19 +685,19 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, attrs->valid ? 'V' : '-'); /* - * Handle VIDMEM progamming. Currently uses a different scatter list - * format. + * For historical reasons these are separate, but soon these will be + * unified. */ if (attrs->aperture == APERTURE_VIDMEM) err = __nvgpu_gmmu_update_page_table_vidmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, attrs); else err = __nvgpu_gmmu_update_page_table_sysmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, @@ -717,7 +706,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, unmap_gmmu_pages(g, &vm->pdb); nvgpu_smp_mb(); - __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); + __gmmu_dbg(g, attrs, "%-5s Done!", sgl ? 
"MAP" : "UNMAP"); return err; } @@ -736,7 +725,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, */ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, u64 vaddr, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -785,7 +774,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, allocated = true; } - err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, + err = __nvgpu_gmmu_update_page_table(vm, sgl, buffer_offset, vaddr, size, &attrs); if (err) { nvgpu_err(g, "failed to update ptes on map"); diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c new file mode 100644 index 00000000..7296c673 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "gk20a/gk20a.h" + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl) +{ + return sgl->next; +} + +u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl) +{ + return sgl->phys; +} + +u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl) +{ + return sgl->dma; +} + +u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl) +{ + return sgl->length; +} + +/* + * This builds a GPU address for the %sgl based on whether an IOMMU is present + * or not. It also handles turning the physical address into the true GPU + * physical address that should be programmed into the page tables. + */ +u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl, + struct nvgpu_gmmu_attrs *attrs) +{ + if (nvgpu_mem_sgl_dma(sgl) == 0) + return g->ops.mm.gpu_phys_addr(g, attrs, + nvgpu_mem_sgl_phys(sgl)); + + if (nvgpu_mem_sgl_dma(sgl) == DMA_ERROR_CODE) + return 0; + + return gk20a_mm_smmu_vaddr_translate(g, nvgpu_mem_sgl_dma(sgl)); +} + +void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *next; + + /* + * Free each of the elements. We expect each element to have been + * nvgpu_k[mz]alloc()ed. 
+ */ + while (sgl) { + next = nvgpu_mem_sgl_next(sgl); + nvgpu_kfree(g, sgl); + sgl = next; + } +} diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c index 72ff8f2d..6d92b457 100644 --- a/drivers/gpu/nvgpu/common/mm/page_allocator.c +++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c @@ -147,19 +147,16 @@ static void __nvgpu_free_pages(struct nvgpu_page_allocator *a, struct nvgpu_page_alloc *alloc, bool free_buddy_alloc) { - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl = alloc->sgl; - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, - list_entry); - nvgpu_list_del(&chunk->list_entry); - - if (free_buddy_alloc) - nvgpu_free(&a->source_allocator, chunk->base); - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (free_buddy_alloc) { + while (sgl) { + nvgpu_free(&a->source_allocator, sgl->phys); + sgl = nvgpu_mem_sgl_next(sgl); + } } + nvgpu_mem_sgl_free(a->owner->g, alloc->sgl); nvgpu_kmem_cache_free(a->alloc_cache, alloc); } @@ -243,15 +240,14 @@ static void free_slab_page(struct nvgpu_page_allocator *a, } /* - * This expects @alloc to have 1 empty page_alloc_chunk already added to the - * alloc_chunks list. + * This expects @alloc to have 1 empty sgl_entry ready for usage. */ static int __do_slab_alloc(struct nvgpu_page_allocator *a, struct page_alloc_slab *slab, struct nvgpu_page_alloc *alloc) { struct page_alloc_slab_page *slab_page = NULL; - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl; unsigned long offs; /* @@ -302,18 +298,19 @@ static int __do_slab_alloc(struct nvgpu_page_allocator *a, BUG(); /* Should be impossible to hit this. */ /* - * Handle building the nvgpu_page_alloc struct. We expect one - * page_alloc_chunk to be present. + * Handle building the nvgpu_page_alloc struct. We expect one sgl + * to be present. 
*/ alloc->slab_page = slab_page; alloc->nr_chunks = 1; alloc->length = slab_page->slab_size; alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - chunk->base = alloc->base; - chunk->length = alloc->length; + sgl = alloc->sgl; + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = alloc->length; + sgl->next = NULL; return 0; } @@ -327,7 +324,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( int err, slab_nr; struct page_alloc_slab *slab; struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl = NULL; /* * Align the length to a page and then divide by the page size (4k for @@ -341,15 +338,13 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); goto fail; } - chunk = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!chunk) { - palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { + palloc_dbg(a, "OOM: could not alloc sgl struct!\n"); goto fail; } - nvgpu_init_list_node(&alloc->alloc_chunks); - nvgpu_list_add(&chunk->list_entry, &alloc->alloc_chunks); - + alloc->sgl = sgl; err = __do_slab_alloc(a, slab, alloc); if (err) goto fail; @@ -363,8 +358,8 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( fail: if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); - if (chunk) - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); return NULL; } @@ -426,7 +421,7 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 pages) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl, *prev_sgl = NULL; u64 max_chunk_len = pages << a->page_shift; int i = 0; @@ -436,7 +431,6 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( memset(alloc, 0, sizeof(*alloc)); - nvgpu_init_list_node(&alloc->alloc_chunks); alloc->length = pages << a->page_shift; while (pages) { @@ -482,36 +476,48 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( goto fail_cleanup; } - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!c) { + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { nvgpu_free(&a->source_allocator, chunk_addr); goto fail_cleanup; } pages -= chunk_pages; - c->base = chunk_addr; - c->length = chunk_len; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = chunk_addr; + sgl->dma = chunk_addr; + sgl->length = chunk_len; + + /* + * Build the singly linked list with a head node that is part of + * the list. 
+ */ + if (prev_sgl) + prev_sgl->next = sgl; + else + alloc->sgl = sgl; + + prev_sgl = sgl; i++; } alloc->nr_chunks = i; - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - alloc->base = c->base; + alloc->base = alloc->sgl->phys; return alloc; fail_cleanup: - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - nvgpu_list_del(&c->list_entry); - nvgpu_free(&a->source_allocator, c->base); - nvgpu_kmem_cache_free(a->chunk_cache, c); + sgl = alloc->sgl; + while (sgl) { + struct nvgpu_mem_sgl *next = sgl->next; + + nvgpu_free(&a->source_allocator, sgl->phys); + nvgpu_kfree(a->owner->g, sgl); + + sgl = next; } + nvgpu_kmem_cache_free(a->alloc_cache, alloc); fail: return NULL; @@ -521,7 +527,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 len) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 pages; int i = 0; @@ -536,11 +542,15 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", pages << a->page_shift, pages, alloc->base); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } + palloc_dbg(a, "Alloc done\n"); return alloc; } @@ -638,11 +648,11 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( struct nvgpu_page_allocator *a, u64 base, u64 length, u32 unused) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; alloc = nvgpu_kmem_cache_alloc(a->alloc_cache); - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!alloc || !c) + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!alloc || !sgl) goto fail; alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length, 0); @@ -653,17 +663,18 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( alloc->nr_chunks = 1; alloc->length = length; - nvgpu_init_list_node(&alloc->alloc_chunks); + alloc->sgl = sgl; - c->base = alloc->base; - c->length = length; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = length; + sgl->next = NULL; return alloc; fail: - if (c) - nvgpu_kmem_cache_free(a->chunk_cache, c); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); return NULL; @@ -677,7 +688,7 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, { struct nvgpu_page_allocator *a = page_allocator(__a); struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 aligned_len, pages; int i = 0; @@ -697,10 +708,13 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", alloc->base, aligned_len, pages); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } a->nr_fixed_allocs++; @@ -896,11 +910,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, a->alloc_cache = nvgpu_kmem_cache_create(g, sizeof(struct 
nvgpu_page_alloc)); - a->chunk_cache = nvgpu_kmem_cache_create(g, - sizeof(struct page_alloc_chunk)); a->slab_page_cache = nvgpu_kmem_cache_create(g, sizeof(struct page_alloc_slab_page)); - if (!a->alloc_cache || !a->chunk_cache || !a->slab_page_cache) { + if (!a->alloc_cache || !a->slab_page_cache) { err = -ENOMEM; goto fail; } @@ -941,8 +953,6 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, fail: if (a->alloc_cache) nvgpu_kmem_cache_destroy(a->alloc_cache); - if (a->chunk_cache) - nvgpu_kmem_cache_destroy(a->chunk_cache); if (a->slab_page_cache) nvgpu_kmem_cache_destroy(a->slab_page_cache); nvgpu_kfree(g, a); diff --git a/drivers/gpu/nvgpu/common/pramin.c b/drivers/gpu/nvgpu/common/pramin.c index 425bfdb4..bb7d930e 100644 --- a/drivers/gpu/nvgpu/common/pramin.c +++ b/drivers/gpu/nvgpu/common/pramin.c @@ -84,37 +84,40 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem, u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl; u32 byteoff, start_reg, until_end, n; alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - if (offset >= chunk->length) - offset -= chunk->length; - else + sgl = alloc->sgl; + while (sgl) { + if (offset >= nvgpu_mem_sgl_length(sgl)) { + offset -= nvgpu_mem_sgl_length(sgl); + sgl = sgl->next; + } else { break; + } } while (size) { - byteoff = g->ops.pramin.enter(g, mem, chunk, + u32 sgl_len = (u32)nvgpu_mem_sgl_length(sgl); + + byteoff = g->ops.pramin.enter(g, mem, sgl, offset / sizeof(u32)); start_reg = g->ops.pramin.data032_r(byteoff / sizeof(u32)); until_end = SZ_1M - (byteoff & (SZ_1M - 1)); - n = min3(size, until_end, (u32)(chunk->length - offset)); + n = min3(size, until_end, (u32)(sgl_len - offset)); loop(g, start_reg, n / sizeof(u32), arg); /* read back to synchronize accesses */ gk20a_readl(g, start_reg); - g->ops.pramin.exit(g, mem, chunk); + g->ops.pramin.exit(g, mem, sgl); size -= n; - if (n == (chunk->length - offset)) { - chunk = nvgpu_list_next_entry(chunk, page_alloc_chunk, - list_entry); + if (n == (sgl_len - offset)) { + sgl = nvgpu_mem_sgl_next(sgl); offset = 0; } else { offset += n; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 7eee2d51..355228db 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -34,6 +34,7 @@ struct gk20a_debug_output; struct nvgpu_clk_pll_debug_data; struct nvgpu_nvhost_dev; struct nvgpu_cpu_time_correlation_sample; +struct nvgpu_mem_sgl; #include #include @@ -70,8 +71,6 @@ struct nvgpu_cpu_time_correlation_sample; #endif #include "ecc_gk20a.h" -struct page_alloc_chunk; - /* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds. 32 ns is the resolution of ptimer. 
*/ #define PTIMER_REF_FREQ_HZ 31250000 @@ -701,7 +700,7 @@ struct gpu_ops { bool (*support_sparse)(struct gk20a *g); u64 (*gmmu_map)(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -761,9 +760,9 @@ struct gpu_ops { size_t size); struct { u32 (*enter)(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w); + struct nvgpu_mem_sgl *sgl, u32 w); void (*exit)(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk); + struct nvgpu_mem_sgl *sgl); u32 (*data032_r)(u32 i); } pramin; struct { diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 97b7aa80..cd34e769 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -1151,7 +1151,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) struct gk20a_fence *gk20a_fence_out = NULL; struct gk20a_fence *gk20a_last_fence = NULL; struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl = NULL; int err = 0; if (g->mm.vidmem.ce_ctx_id == (u32)~0) @@ -1159,16 +1159,16 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { if (gk20a_last_fence) gk20a_fence_put(gk20a_last_fence); err = gk20a_ce_execute_ops(g, g->mm.vidmem.ce_ctx_id, 0, - chunk->base, - chunk->length, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl), 0x00000000, NVGPU_CE_DST_LOCATION_LOCAL_FB, NVGPU_CE_MEMSET, @@ -1183,6 +1183,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) } gk20a_last_fence = gk20a_fence_out; + sgl = nvgpu_mem_sgl_next(sgl); } if (gk20a_last_fence) { @@ -1262,10 +1263,10 @@ dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr) return addr; } -u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova) +u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, u64 iova) { /* ensure it is not vidmem allocation */ - WARN_ON(is_vidmem_page_alloc((u64)iova)); + WARN_ON(is_vidmem_page_alloc(iova)); if (device_is_iommuable(dev_from_gk20a(g)) && g->ops.mm.get_physical_addr_bits) @@ -2167,11 +2168,6 @@ u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g) return 34; } -u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags) -{ - return phys; -} - const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, u32 big_page_size) { diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index c77bebf8..2fdc1729 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -336,7 +336,6 @@ void gk20a_mm_dump_vm(struct vm_gk20a *vm, int gk20a_mm_suspend(struct gk20a *g); -u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags); u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); void gk20a_mm_ltc_isr(struct gk20a *g); @@ -361,29 +360,29 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem) } u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, - u64 map_offset, - struct sg_table *sgt, - u64 buffer_offset, - u64 size, - int pgsz_idx, - u8 kind_v, - u32 ctag_offset, - u32 flags, - int rw_flag, - bool clear_ctags, - bool sparse, - bool priv, - struct vm_gk20a_mapping_batch *batch, - enum nvgpu_aperture aperture); + u64 map_offset, + struct 
nvgpu_mem_sgl *sgl, + u64 buffer_offset, + u64 size, + int pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + int rw_flag, + bool clear_ctags, + bool sparse, + bool priv, + struct vm_gk20a_mapping_batch *batch, + enum nvgpu_aperture aperture); void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, - u64 vaddr, - u64 size, - int pgsz_idx, - bool va_allocated, - int rw_flag, - bool sparse, - struct vm_gk20a_mapping_batch *batch); + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch); struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf); void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf, diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c index 9d19e9e5..8a34a63c 100644 --- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c @@ -26,9 +26,9 @@ /* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w) + struct nvgpu_mem_sgl *sgl, u32 w) { - u64 bufbase = chunk->base; + u64 bufbase = nvgpu_mem_sgl_phys(sgl); u64 addr = bufbase + w * sizeof(u32); u32 hi = (u32)((addr & ~(u64)0xfffff) >> bus_bar0_window_target_bar0_window_base_shift_v()); @@ -40,8 +40,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, gk20a_dbg(gpu_dbg_mem, "0x%08x:%08x begin for %p,%p at [%llx,%llx] (sz %llx)", - hi, lo, mem, chunk, bufbase, - bufbase + chunk->length, chunk->length); + hi, lo, mem, sgl, bufbase, + bufbase + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); WARN_ON(!bufbase); @@ -57,9 +58,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, } void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk) + struct nvgpu_mem_sgl *sgl) { - gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, chunk); + gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, sgl); nvgpu_spinlock_release(&g->mm.pramin_window_lock); } diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h index 1a1ac871..fc5ba919 100644 --- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h @@ -19,10 +19,10 @@ struct gk20a; struct nvgpu_mem; -struct page_alloc_chunk; +struct nvgpu_mem_sgl; u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w); + struct nvgpu_mem_sgl *sgl, u32 w); void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk); + struct nvgpu_mem_sgl *sgl); #endif diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index fc27b120..c276f5a6 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -904,7 +904,7 @@ int gr_gp10b_alloc_buffer(struct vm_gk20a *vm, size_t size, mem->gpu_va = nvgpu_gmmu_map(vm, mem, - size, + mem->aligned_size, NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, gk20a_mem_flag_none, false, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index de129a5f..11060300 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h @@ -27,8 +27,6 @@ #include #endif -struct scatterlist; - /* * This is the GMMU API visible to blocks outside of the GMMU. 
Basically this * API supports all the different types of mappings that might be done in the diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h index e2d4d336..f96c2801 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h @@ -32,6 +32,8 @@ struct nvgpu_mem_priv { }; u64 nvgpu_mem_get_addr_sgl(struct gk20a *g, struct scatterlist *sgl); +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g, + struct sg_table *sgt); /** * __nvgpu_mem_create_from_pages - Create an nvgpu_mem from physical pages. diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 4cac3e70..cfce8c5b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h @@ -71,6 +71,7 @@ enum nvgpu_log_categories { gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */ gpu_dbg_alloc = BIT(21), /* Allocator debugging. */ gpu_dbg_dma = BIT(22), /* DMA allocation prints. */ + gpu_dbg_sgl = BIT(23), /* SGL related traces. */ gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ }; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h index a112623e..7d19cf81 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h @@ -33,6 +33,8 @@ struct gk20a; struct nvgpu_allocator; struct nvgpu_gmmu_attrs; +#define NVGPU_MEM_DMA_ERROR (~0ULL) + /* * Real location of a buffer - nvgpu_aperture_mask() will deduce what will be * told to the gpu about the aperture, but this flag designates where the @@ -44,6 +46,28 @@ enum nvgpu_aperture { APERTURE_VIDMEM }; +/* + * This struct holds the necessary information for describing a struct + * nvgpu_mem's scatter gather list. + * + * These are created in a platform dependent way. As a result the function + * definition for allocating these lives in the file. + */ +struct nvgpu_mem_sgl { + /* + * Internally this is implemented as a singly linked list. + */ + struct nvgpu_mem_sgl *next; + + /* + * There is both a phys address and a DMA address since some systems, + * for example ones with an IOMMU, may see these as different addresses. + */ + u64 phys; + u64 dma; + u64 length; +}; + struct nvgpu_mem { /* * Populated for all nvgpu_mem structs - vidmem or system. @@ -176,6 +200,27 @@ int nvgpu_mem_create_from_mem(struct gk20a *g, struct nvgpu_mem *dest, struct nvgpu_mem *src, int start_page, int nr_pages); +/** + * nvgpu_mem_sgl_create_from_mem - Create a scatter list from an nvgpu_mem. + * + * @g - The GPU. + * @mem - The source memory allocation to use. + * + * Create a scatter gather list from the passed @mem struct. This list lets the + * calling code iterate across each chunk of a DMA allocation for when that DMA + * allocation is not completely contiguous. + */ +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g, + struct nvgpu_mem *mem); +void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl); + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl, + struct nvgpu_gmmu_attrs *attrs); + /* * Buffer accessors - wrap between begin() and end() if there is no permanent * kernel mapping for this buffer. 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h index 9a5ef8d3..de83ca7f 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h +++ b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h @@ -18,6 +18,7 @@ #define PAGE_ALLOCATOR_PRIV_H #include +#include #include #include #include @@ -83,27 +84,17 @@ page_alloc_slab_page_from_list_entry(struct nvgpu_list_node *node) ((uintptr_t)node - offsetof(struct page_alloc_slab_page, list_entry)); }; -struct page_alloc_chunk { - struct nvgpu_list_node list_entry; - - u64 base; - u64 length; -}; - -static inline struct page_alloc_chunk * -page_alloc_chunk_from_list_entry(struct nvgpu_list_node *node) -{ - return (struct page_alloc_chunk *) - ((uintptr_t)node - offsetof(struct page_alloc_chunk, list_entry)); -}; - /* * Struct to handle internal management of page allocation. It holds a list * of the chunks of pages that make up the overall allocation - much like a * scatter gather table. */ struct nvgpu_page_alloc { - struct nvgpu_list_node alloc_chunks; + /* + * nvgpu_mem_sgl for describing the actual allocation. Convenient for + * GMMU mapping. + */ + struct nvgpu_mem_sgl *sgl; int nr_chunks; u64 length; @@ -156,7 +147,6 @@ struct nvgpu_page_allocator { int nr_slabs; struct nvgpu_kmem_cache *alloc_cache; - struct nvgpu_kmem_cache *chunk_cache; struct nvgpu_kmem_cache *slab_page_cache; u64 flags; diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c index 85c436e5..ee9b791a 100644 --- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c +++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c @@ -13,7 +13,6 @@ * more details. */ -#include #include "vgpu/vgpu.h" #include "vgpu_mm_gp10b.h" #include "gk20a/mm_gk20a.h" @@ -41,7 +40,7 @@ static inline int add_mem_desc(struct tegra_vgpu_mem_desc *mem_desc, static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -61,10 +60,9 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, struct tegra_vgpu_as_map_ex_params *p = &msg.params.as_map_ex; struct tegra_vgpu_mem_desc *mem_desc; u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; + u64 buffer_size = PAGE_ALIGN(size); u64 space_to_skip = buffer_offset; - u64 buffer_size = 0; u32 mem_desc_count = 0, i; - struct scatterlist *sgl; void *handle = NULL; size_t oob_size; u8 prot; @@ -73,7 +71,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, /* FIXME: add support for sparse mappings */ - if (WARN_ON(!sgt) || WARN_ON(!g->mm.bypass_smmu)) + if (WARN_ON(!sgl) || WARN_ON(!g->mm.bypass_smmu)) return 0; if (space_to_skip & (page_size - 1)) @@ -100,33 +98,36 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, goto fail; } - sgl = sgt->sgl; - while (space_to_skip && sgl && - (space_to_skip + page_size > sgl->length)) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); - } - WARN_ON(!sgl); + while (sgl) { + u64 phys_addr; + u64 chunk_length; + + /* + * Cut out sgl ents for space_to_skip. 
+ */ + if (space_to_skip && + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); + continue; + } - if (add_mem_desc(&mem_desc[mem_desc_count++], - sg_phys(sgl) + space_to_skip, - sgl->length - space_to_skip, - &oob_size)) { - err = -ENOMEM; - goto fail; - } - buffer_size += sgl->length - space_to_skip; + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(size, + nvgpu_mem_sgl_length(sgl) - space_to_skip); - sgl = sg_next(sgl); - while (sgl && buffer_size < size) { - if (add_mem_desc(&mem_desc[mem_desc_count++], sg_phys(sgl), - sgl->length, &oob_size)) { + if (add_mem_desc(&mem_desc[mem_desc_count++], phys_addr, + chunk_length, &oob_size)) { err = -ENOMEM; goto fail; } - buffer_size += sgl->length; - sgl = sg_next(sgl); + space_to_skip = 0; + size -= chunk_length; + sgl = nvgpu_mem_sgl_next(sgl); + + if (size == 0) + break; } if (rw_flag == gk20a_mem_flag_read_only) @@ -153,7 +154,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, msg.handle = vgpu_get_handle(g); p->handle = vm->handle; p->gpu_va = map_offset; - p->size = size; + p->size = buffer_size; p->mem_desc_count = mem_desc_count; p->pgsz_idx = pgsz_idx; p->iova = 0; diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c index ef9e00c8..5da6f158 100644 --- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c @@ -78,7 +78,7 @@ int vgpu_init_mm_support(struct gk20a *g) static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -98,7 +98,7 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d); struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_as_map_params *p = &msg.params.as_map; - u64 addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); + u64 addr = nvgpu_mem_sgl_gpu_addr(g, sgl, NULL); u8 prot; gk20a_dbg_fn(""); -- cgit v1.2.2
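
A minimal usage sketch of the new SGL API, for reviewers; it is not part of
the diff above. It builds an nvgpu_mem_sgl chain from an nvgpu_mem and walks
each chunk using only the accessors this patch introduces. The function name
walk_sgl_example and the exact include lines are illustrative assumptions,
not code taken from the patch.

#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/log.h>

#include "gk20a/gk20a.h"

/*
 * Illustrative only: walk every chunk of a (possibly discontiguous) DMA
 * allocation using the accessors introduced by this patch.
 */
static void walk_sgl_example(struct gk20a *g, struct nvgpu_mem *mem)
{
	struct nvgpu_mem_sgl *head, *sgl;

	/* Build the platform-agnostic SGL from the nvgpu_mem. */
	head = nvgpu_mem_sgl_create_from_mem(g, mem);
	if (!head)
		return;

	/* Each list node describes one physically contiguous chunk. */
	for (sgl = head; sgl; sgl = nvgpu_mem_sgl_next(sgl))
		nvgpu_log(g, gpu_dbg_sgl, "chunk: phys 0x%llx len 0x%llx",
			  nvgpu_mem_sgl_phys(sgl),
			  nvgpu_mem_sgl_length(sgl));

	/* The caller owns the SGL and must free it when done. */
	nvgpu_mem_sgl_free(g, head);
}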