From 0090ee5aca268a3c359f34c74b8c521df3bd8593 Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Thu, 25 May 2017 16:56:50 -0700
Subject: gpu: nvgpu: nvgpu SGL implementation

The last major item preventing the core MM code in the nvgpu driver from
being platform agnostic is the usage of Linux scatter-gather tables and
scatter-gather lists. These data structures are used throughout the mapping
code to handle discontiguous DMA allocations and are also overloaded to
represent VIDMEM allocs.

The notion of a scatter-gather table is crucial to a HW device that can
handle discontiguous DMA. The GPU has an MMU which allows the GPU to do page
gathering and present a virtually contiguous buffer to the GPU HW. As a
result, it makes sense for the GPU driver to use some sort of scatter-gather
concept to maximize memory usage efficiency.

To that end, this patch keeps the notion of a scatter-gather list but
implements it in the nvgpu common code. It is based heavily on the Linux SGL
concept. It is a singly linked list of blocks, each representing a chunk of
memory. To map or use a DMA allocation, SW must iterate over each block in
the SGL.

This patch implements the most basic level of support for this data
structure. There are certainly easy optimizations that could be done to
speed up the current implementation. However, this patch's goal is simply to
divest the core MM code of any last Linux'isms. Speed and efficiency come
next.

Change-Id: Icf44641db22d87fa1d003debbd9f71b605258e42
Signed-off-by: Alex Waterman
Reviewed-on: https://git-master.nvidia.com/r/1530867
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/linux/nvgpu_mem.c | 114 +++++++++++++++++++++
 drivers/gpu/nvgpu/common/linux/vm.c | 25 +++--
 drivers/gpu/nvgpu/common/mm/gmmu.c | 109 +++++++++-----------
 drivers/gpu/nvgpu/common/mm/nvgpu_mem.c | 73 ++++++++++++++
 drivers/gpu/nvgpu/common/mm/page_allocator.c | 142 ++++++++++++++-------------
 drivers/gpu/nvgpu/common/pramin.c | 27 ++---
 6 files changed, 345 insertions(+), 145 deletions(-)
 create mode 100644 drivers/gpu/nvgpu/common/mm/nvgpu_mem.c
(limited to 'drivers/gpu/nvgpu/common')

diff --git a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c index e4991d0d..eb54f3fd 100644 --- a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c +++ b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -395,3 +396,116 @@ int __nvgpu_mem_create_from_pages(struct gk20a *g, struct nvgpu_mem *dest, return 0; } + +static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_dup(struct gk20a *g, + struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *head, *next; + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + next = head; + while (true) { + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + next->dma = sgl->dma; + next->phys = sgl->phys; + next->length = sgl->length; + next->next = NULL; + + sgl = nvgpu_mem_sgl_next(sgl); + if (!sgl) + break; + + next->next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next->next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + next = next->next; + } + + return head; +} + +static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_create_from_vidmem( + struct gk20a *g, + struct scatterlist *linux_sgl) +{ + struct nvgpu_page_alloc *vidmem_alloc; + + vidmem_alloc = get_vidmem_page_alloc(linux_sgl); + if (!vidmem_alloc) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Vidmem sgl:"); + + return
__nvgpu_mem_sgl_dup(g, vidmem_alloc->sgl); +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g, + struct sg_table *sgt) +{ + struct nvgpu_mem_sgl *head, *sgl, *next; + struct scatterlist *linux_sgl = sgt->sgl; + + if (is_vidmem_page_alloc(sg_dma_address(linux_sgl))) + return __nvgpu_mem_sgl_create_from_vidmem(g, linux_sgl); + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Making sgl:"); + + sgl = head; + while (true) { + sgl->dma = sg_dma_address(linux_sgl); + sgl->phys = sg_phys(linux_sgl); + sgl->length = linux_sgl->length; + + /* + * We don't like offsets in the pages here. This will cause + * problems. + */ + if (WARN_ON(linux_sgl->offset)) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + /* + * When there's no more SGL ents for the Linux SGL we are + * done. Don't bother making any more SGL ents for the nvgpu + * SGL. + */ + linux_sgl = sg_next(linux_sgl); + if (!linux_sgl) + break; + + next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + sgl->next = next; + sgl = next; + } + + nvgpu_log(g, gpu_dbg_sgl, "Done!"); + return head; +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g, + struct nvgpu_mem *mem) +{ + return nvgpu_mem_sgl_create(g, mem->priv.sgt); +} diff --git a/drivers/gpu/nvgpu/common/linux/vm.c b/drivers/gpu/nvgpu/common/linux/vm.c index 86d8bec9..4a4429dc 100644 --- a/drivers/gpu/nvgpu/common/linux/vm.c +++ b/drivers/gpu/nvgpu/common/linux/vm.c @@ -21,8 +21,11 @@ #include #include #include +#include #include +#include + #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" #include "gk20a/kind_gk20a.h" @@ -66,17 +69,19 @@ static u64 nvgpu_get_buffer_alignment(struct gk20a *g, struct scatterlist *sgl, if (aperture == APERTURE_VIDMEM) { struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl); - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl_vid = alloc->sgl; - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - chunk_align = 1ULL << __ffs(chunk->base | - chunk->length); + while (sgl_vid) { + chunk_align = 1ULL << + __ffs(nvgpu_mem_sgl_phys(sgl_vid) | + nvgpu_mem_sgl_length(sgl_vid)); if (align) align = min(align, chunk_align); else align = chunk_align; + + sgl_vid = nvgpu_mem_sgl_next(sgl_vid); } return align; @@ -237,6 +242,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, struct nvgpu_vm_area *vm_area = NULL; u32 ctag_offset; enum nvgpu_aperture aperture; + struct nvgpu_mem_sgl *nvgpu_sgl; /* * The kind used as part of the key for map caching. 
HW may @@ -393,9 +399,12 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, ctag_offset += buffer_offset >> ilog2(g->ops.fb.compression_page_size(g)); + nvgpu_sgl = nvgpu_mem_sgl_create(g, bfr.sgt); + /* update gmmu ptes */ - map_offset = g->ops.mm.gmmu_map(vm, map_offset, - bfr.sgt, + map_offset = g->ops.mm.gmmu_map(vm, + map_offset, + nvgpu_sgl, buffer_offset, /* sg offset */ mapping_size, bfr.pgsz_idx, @@ -410,6 +419,8 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, if (!map_offset) goto clean_up; + nvgpu_mem_sgl_free(g, nvgpu_sgl); + mapped_buffer = nvgpu_kzalloc(g, sizeof(*mapped_buffer)); if (!mapped_buffer) { nvgpu_warn(g, "oom allocating tracking buffer"); diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index 7f486d68..41f5acdd 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -65,11 +65,14 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, struct gk20a *g = gk20a_from_vm(vm); u64 vaddr; - struct sg_table *sgt = mem->priv.sgt; + struct nvgpu_mem_sgl *sgl = nvgpu_mem_sgl_create_from_mem(g, mem); + + if (!sgl) + return -ENOMEM; nvgpu_mutex_acquire(&vm->update_gmmu_lock); vaddr = g->ops.mm.gmmu_map(vm, addr, - sgt, /* sg table */ + sgl, /* sg list */ 0, /* sg offset */ size, gmmu_page_size_kernel, @@ -82,8 +85,11 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, NULL, /* mapping_batch handle */ aperture); nvgpu_mutex_release(&vm->update_gmmu_lock); + + nvgpu_mem_sgl_free(g, sgl); + if (!vaddr) { - nvgpu_err(g, "failed to allocate va space"); + nvgpu_err(g, "failed to map buffer!"); return 0; } @@ -91,7 +97,7 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. + * Map a nvgpu_mem into the GMMU. This is for kernel space to use. */ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -106,7 +112,7 @@ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Like nvgpu_gmmu_map() except it can work on a fixed address instead. + * Like nvgpu_gmmu_map() except this can work on a fixed address. */ u64 nvgpu_gmmu_map_fixed(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -407,7 +413,7 @@ static int __set_pd_level(struct vm_gk20a *vm, */ target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : - g->ops.mm.gpu_phys_addr(g, attrs, phys_addr); + phys_addr; l->update_entry(vm, l, pd, pd_idx, @@ -458,18 +464,16 @@ static int __set_pd_level(struct vm_gk20a *vm, * VIDMEM version of the update_ptes logic. */ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { - struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; u64 phys_addr, chunk_length; int err = 0; - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -482,22 +486,21 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, return err; } - alloc = get_vidmem_page_alloc(sgt->sgl); - /* * Otherwise iterate across all the chunks in this allocation and * map them. 
*/ - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + while (sgl) { if (space_to_skip && - space_to_skip >= chunk->length) { - space_to_skip -= chunk->length; + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = chunk->base + space_to_skip; - chunk_length = min(length, (chunk->length - space_to_skip)); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, (nvgpu_mem_sgl_length(sgl) - + space_to_skip)); err = __set_pd_level(vm, &vm->pdb, 0, @@ -518,23 +521,24 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, if (length == 0) break; + + sgl = nvgpu_mem_sgl_next(sgl); } return err; } static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { int err; - struct scatterlist *sgl; struct gk20a *g = gk20a_from_vm(vm); - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -548,19 +552,15 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, } /* - * At this point we have a Linux scatter-gather list pointing to some - * number of discontiguous chunks of memory. Iterate over that list and + * At this point we have a scatter-gather list pointing to some number + * of discontiguous chunks of memory. We must iterate over that list and * generate a GMMU map call for each chunk. There are two possibilities: - * either the IOMMU is enabled or not. When the IOMMU is enabled the + * either an IOMMU is enabled or not. When an IOMMU is enabled the * mapping is simple since the "physical" address is actually a virtual - * IO address and will be contiguous. The no-IOMMU case is more - * complicated. We will have to iterate over the SGT and do a separate - * map for each chunk of the SGT. + * IO address and will be contiguous. */ - sgl = sgt->sgl; - if (!g->mm.bypass_smmu) { - u64 io_addr = nvgpu_mem_get_addr_sgl(g, sgl); + u64 io_addr = nvgpu_mem_sgl_gpu_addr(g, sgl, attrs); io_addr += space_to_skip; @@ -585,14 +585,16 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, /* * Cut out sgl ents for space_to_skip. */ - if (space_to_skip && space_to_skip >= sgl->length) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); + if (space_to_skip && + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = sg_phys(sgl) + space_to_skip; - chunk_length = min(length, sgl->length - space_to_skip); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, + nvgpu_mem_sgl_length(sgl) - space_to_skip); err = __set_pd_level(vm, &vm->pdb, 0, @@ -600,13 +602,11 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, virt_addr, chunk_length, attrs); - if (err) - return err; space_to_skip = 0; virt_addr += chunk_length; length -= chunk_length; - sgl = sg_next(sgl); + sgl = nvgpu_mem_sgl_next(sgl); if (length == 0) break; @@ -624,22 +624,20 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, * implementations. But the logic around that is generic to all chips. Every * chip has some number of PDE levels and then a PTE level. 
* - * Each chunk of the incoming SGT is sent to the chip specific implementation + * Each chunk of the incoming SGL is sent to the chip specific implementation * of page table update. * * [*] Note: the "physical" address may actually be an IO virtual address in the * case of SMMU usage. */ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); - struct nvgpu_page_alloc *alloc; - u64 phys_addr = 0; u32 page_size; int err; @@ -665,25 +663,16 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, return err; } - if (sgt) { - if (attrs->aperture == APERTURE_VIDMEM) { - alloc = get_vidmem_page_alloc(sgt->sgl); - - phys_addr = alloc->base; - } else - phys_addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); - } - __gmmu_dbg(g, attrs, "vm=%s " "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " "kind=%#02x APT=%-6s %c%c%c%c%c", vm->name, - sgt ? "MAP" : "UNMAP", + sgl ? "MAP" : "UNMAP", virt_addr, length, - phys_addr, + sgl ? nvgpu_mem_sgl_phys(sgl) : 0, space_to_skip, page_size >> 10, nvgpu_gmmu_perm_str(attrs->rw_flag), @@ -696,19 +685,19 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, attrs->valid ? 'V' : '-'); /* - * Handle VIDMEM progamming. Currently uses a different scatter list - * format. + * For historical reasons these are separate, but soon these will be + * unified. */ if (attrs->aperture == APERTURE_VIDMEM) err = __nvgpu_gmmu_update_page_table_vidmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, attrs); else err = __nvgpu_gmmu_update_page_table_sysmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, @@ -717,7 +706,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, unmap_gmmu_pages(g, &vm->pdb); nvgpu_smp_mb(); - __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); + __gmmu_dbg(g, attrs, "%-5s Done!", sgl ? "MAP" : "UNMAP"); return err; } @@ -736,7 +725,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, */ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, u64 vaddr, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -785,7 +774,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, allocated = true; } - err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, + err = __nvgpu_gmmu_update_page_table(vm, sgl, buffer_offset, vaddr, size, &attrs); if (err) { nvgpu_err(g, "failed to update ptes on map"); diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c new file mode 100644 index 00000000..7296c673 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include + +#include "gk20a/gk20a.h" + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl) +{ + return sgl->next; +} + +u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl) +{ + return sgl->phys; +} + +u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl) +{ + return sgl->dma; +} + +u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl) +{ + return sgl->length; +} + +/* + * This builds a GPU address for the %sgl based on whether an IOMMU is present + * or not. It also handles turning the physical address into the true GPU + * physical address that should be programmed into the page tables. + */ +u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl, + struct nvgpu_gmmu_attrs *attrs) +{ + if (nvgpu_mem_sgl_dma(sgl) == 0) + return g->ops.mm.gpu_phys_addr(g, attrs, + nvgpu_mem_sgl_phys(sgl)); + + if (nvgpu_mem_sgl_dma(sgl) == DMA_ERROR_CODE) + return 0; + + return gk20a_mm_smmu_vaddr_translate(g, nvgpu_mem_sgl_dma(sgl)); +} + +void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *next; + + /* + * Free each of the elements. We expect each element to have been + * nvgpu_k[mz]alloc()ed. + */ + while (sgl) { + next = nvgpu_mem_sgl_next(sgl); + nvgpu_kfree(g, sgl); + sgl = next; + } +} diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c index 72ff8f2d..6d92b457 100644 --- a/drivers/gpu/nvgpu/common/mm/page_allocator.c +++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c @@ -147,19 +147,16 @@ static void __nvgpu_free_pages(struct nvgpu_page_allocator *a, struct nvgpu_page_alloc *alloc, bool free_buddy_alloc) { - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl = alloc->sgl; - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, - list_entry); - nvgpu_list_del(&chunk->list_entry); - - if (free_buddy_alloc) - nvgpu_free(&a->source_allocator, chunk->base); - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (free_buddy_alloc) { + while (sgl) { + nvgpu_free(&a->source_allocator, sgl->phys); + sgl = nvgpu_mem_sgl_next(sgl); + } } + nvgpu_mem_sgl_free(a->owner->g, alloc->sgl); nvgpu_kmem_cache_free(a->alloc_cache, alloc); } @@ -243,15 +240,14 @@ static void free_slab_page(struct nvgpu_page_allocator *a, } /* - * This expects @alloc to have 1 empty page_alloc_chunk already added to the - * alloc_chunks list. + * This expects @alloc to have 1 empty sgl_entry ready for usage. */ static int __do_slab_alloc(struct nvgpu_page_allocator *a, struct page_alloc_slab *slab, struct nvgpu_page_alloc *alloc) { struct page_alloc_slab_page *slab_page = NULL; - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl; unsigned long offs; /* @@ -302,18 +298,19 @@ static int __do_slab_alloc(struct nvgpu_page_allocator *a, BUG(); /* Should be impossible to hit this. */ /* - * Handle building the nvgpu_page_alloc struct. We expect one - * page_alloc_chunk to be present. + * Handle building the nvgpu_page_alloc struct. We expect one sgl + * to be present. 
*/ alloc->slab_page = slab_page; alloc->nr_chunks = 1; alloc->length = slab_page->slab_size; alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - chunk->base = alloc->base; - chunk->length = alloc->length; + sgl = alloc->sgl; + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = alloc->length; + sgl->next = NULL; return 0; } @@ -327,7 +324,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( int err, slab_nr; struct page_alloc_slab *slab; struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl = NULL; /* * Align the length to a page and then divide by the page size (4k for @@ -341,15 +338,13 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); goto fail; } - chunk = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!chunk) { - palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { + palloc_dbg(a, "OOM: could not alloc sgl struct!\n"); goto fail; } - nvgpu_init_list_node(&alloc->alloc_chunks); - nvgpu_list_add(&chunk->list_entry, &alloc->alloc_chunks); - + alloc->sgl = sgl; err = __do_slab_alloc(a, slab, alloc); if (err) goto fail; @@ -363,8 +358,8 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( fail: if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); - if (chunk) - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); return NULL; } @@ -426,7 +421,7 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 pages) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl, *prev_sgl = NULL; u64 max_chunk_len = pages << a->page_shift; int i = 0; @@ -436,7 +431,6 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( memset(alloc, 0, sizeof(*alloc)); - nvgpu_init_list_node(&alloc->alloc_chunks); alloc->length = pages << a->page_shift; while (pages) { @@ -482,36 +476,48 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( goto fail_cleanup; } - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!c) { + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { nvgpu_free(&a->source_allocator, chunk_addr); goto fail_cleanup; } pages -= chunk_pages; - c->base = chunk_addr; - c->length = chunk_len; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = chunk_addr; + sgl->dma = chunk_addr; + sgl->length = chunk_len; + + /* + * Build the singly linked list with a head node that is part of + * the list. 
+ */ + if (prev_sgl) + prev_sgl->next = sgl; + else + alloc->sgl = sgl; + + prev_sgl = sgl; i++; } alloc->nr_chunks = i; - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - alloc->base = c->base; + alloc->base = alloc->sgl->phys; return alloc; fail_cleanup: - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - nvgpu_list_del(&c->list_entry); - nvgpu_free(&a->source_allocator, c->base); - nvgpu_kmem_cache_free(a->chunk_cache, c); + sgl = alloc->sgl; + while (sgl) { + struct nvgpu_mem_sgl *next = sgl->next; + + nvgpu_free(&a->source_allocator, sgl->phys); + nvgpu_kfree(a->owner->g, sgl); + + sgl = next; } + nvgpu_kmem_cache_free(a->alloc_cache, alloc); fail: return NULL; @@ -521,7 +527,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 len) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 pages; int i = 0; @@ -536,11 +542,15 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", pages << a->page_shift, pages, alloc->base); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } + palloc_dbg(a, "Alloc done\n"); return alloc; } @@ -638,11 +648,11 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( struct nvgpu_page_allocator *a, u64 base, u64 length, u32 unused) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; alloc = nvgpu_kmem_cache_alloc(a->alloc_cache); - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!alloc || !c) + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!alloc || !sgl) goto fail; alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length, 0); @@ -653,17 +663,18 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( alloc->nr_chunks = 1; alloc->length = length; - nvgpu_init_list_node(&alloc->alloc_chunks); + alloc->sgl = sgl; - c->base = alloc->base; - c->length = length; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = length; + sgl->next = NULL; return alloc; fail: - if (c) - nvgpu_kmem_cache_free(a->chunk_cache, c); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); return NULL; @@ -677,7 +688,7 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, { struct nvgpu_page_allocator *a = page_allocator(__a); struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 aligned_len, pages; int i = 0; @@ -697,10 +708,13 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", alloc->base, aligned_len, pages); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } a->nr_fixed_allocs++; @@ -896,11 +910,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, a->alloc_cache = nvgpu_kmem_cache_create(g, sizeof(struct 
nvgpu_page_alloc)); - a->chunk_cache = nvgpu_kmem_cache_create(g, - sizeof(struct page_alloc_chunk)); a->slab_page_cache = nvgpu_kmem_cache_create(g, sizeof(struct page_alloc_slab_page)); - if (!a->alloc_cache || !a->chunk_cache || !a->slab_page_cache) { + if (!a->alloc_cache || !a->slab_page_cache) { err = -ENOMEM; goto fail; } @@ -941,8 +953,6 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, fail: if (a->alloc_cache) nvgpu_kmem_cache_destroy(a->alloc_cache); - if (a->chunk_cache) - nvgpu_kmem_cache_destroy(a->chunk_cache); if (a->slab_page_cache) nvgpu_kmem_cache_destroy(a->slab_page_cache); nvgpu_kfree(g, a); diff --git a/drivers/gpu/nvgpu/common/pramin.c b/drivers/gpu/nvgpu/common/pramin.c index 425bfdb4..bb7d930e 100644 --- a/drivers/gpu/nvgpu/common/pramin.c +++ b/drivers/gpu/nvgpu/common/pramin.c @@ -84,37 +84,40 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem, u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl; u32 byteoff, start_reg, until_end, n; alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - if (offset >= chunk->length) - offset -= chunk->length; - else + sgl = alloc->sgl; + while (sgl) { + if (offset >= nvgpu_mem_sgl_length(sgl)) { + offset -= nvgpu_mem_sgl_length(sgl); + sgl = sgl->next; + } else { break; + } } while (size) { - byteoff = g->ops.pramin.enter(g, mem, chunk, + u32 sgl_len = (u32)nvgpu_mem_sgl_length(sgl); + + byteoff = g->ops.pramin.enter(g, mem, sgl, offset / sizeof(u32)); start_reg = g->ops.pramin.data032_r(byteoff / sizeof(u32)); until_end = SZ_1M - (byteoff & (SZ_1M - 1)); - n = min3(size, until_end, (u32)(chunk->length - offset)); + n = min3(size, until_end, (u32)(sgl_len - offset)); loop(g, start_reg, n / sizeof(u32), arg); /* read back to synchronize accesses */ gk20a_readl(g, start_reg); - g->ops.pramin.exit(g, mem, chunk); + g->ops.pramin.exit(g, mem, sgl); size -= n; - if (n == (chunk->length - offset)) { - chunk = nvgpu_list_next_entry(chunk, page_alloc_chunk, - list_entry); + if (n == (sgl_len - offset)) { + sgl = nvgpu_mem_sgl_next(sgl); offset = 0; } else { offset += n; -- cgit v1.2.2
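
For reference only (not part of the patch): the new nvgpu SGL is a singly linked list in which each node describes one physically contiguous chunk via a physical address, a DMA/IOVA address, and a length, and the mapping code walks it with nvgpu_mem_sgl_next()/nvgpu_mem_sgl_phys()/nvgpu_mem_sgl_length(). The standalone C sketch below mirrors the skip-then-map loops in common/mm/gmmu.c. The struct field layout is inferred from the accessors and assignments visible in this patch (the real definition lives in a header outside the shown diff), and map_one_chunk() plus the values in main() are hypothetical stand-ins used purely for illustration.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Field layout inferred from the accessors/assignments in this patch (assumption). */
struct nvgpu_mem_sgl {
	struct nvgpu_mem_sgl *next;   /* singly linked; NULL terminates the list */
	u64 phys;                     /* physical address of this chunk */
	u64 dma;                      /* IOVA; 0 when there is no IOMMU mapping */
	u64 length;                   /* chunk length in bytes */
};

/* Hypothetical stand-in for the per-chunk page table update (__set_pd_level). */
static void map_one_chunk(u64 phys_addr, u64 virt_addr, u64 length)
{
	printf("map phys 0x%llx -> virt 0x%llx len 0x%llx\n",
	       (unsigned long long)phys_addr,
	       (unsigned long long)virt_addr,
	       (unsigned long long)length);
}

/*
 * Walk the SGL the way the vidmem/sysmem update paths in gmmu.c do: skip
 * space_to_skip bytes into the list, then map each chunk until length bytes
 * of GPU virtual space have been covered.
 */
static void map_sgl_example(struct nvgpu_mem_sgl *sgl, u64 space_to_skip,
			    u64 virt_addr, u64 length)
{
	while (sgl && length) {
		u64 chunk_length;

		/* Consume whole chunks that fall entirely before the offset. */
		if (space_to_skip && space_to_skip >= sgl->length) {
			space_to_skip -= sgl->length;
			sgl = sgl->next;
			continue;
		}

		chunk_length = sgl->length - space_to_skip;
		if (chunk_length > length)
			chunk_length = length;

		map_one_chunk(sgl->phys + space_to_skip, virt_addr,
			      chunk_length);

		space_to_skip = 0;
		virt_addr += chunk_length;
		length -= chunk_length;
		sgl = sgl->next;
	}
}

int main(void)
{
	/* Two discontiguous 64K chunks, as the page allocator might produce. */
	struct nvgpu_mem_sgl c1 = { .next = NULL, .phys = 0x200000, .dma = 0,
				    .length = 0x10000 };
	struct nvgpu_mem_sgl c0 = { .next = &c1, .phys = 0x100000, .dma = 0,
				    .length = 0x10000 };

	/* Map 0x18000 bytes starting 0x8000 bytes into the allocation. */
	map_sgl_example(&c0, 0x8000, 0xc0000000, 0x18000);
	return 0;
}

The buffer offset handling (space_to_skip) is the part the patch has to reimplement by hand: the Linux sg_table iterators previously hid this bookkeeping, whereas the nvgpu SGL is walked explicitly one node at a time.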