From a99bbc5f6070a346006cf3f63e7f5f2120f30a2f Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Thu, 11 Dec 2014 11:33:52 -0800
Subject: gpu: nvgpu: make larger address space work

Implement several fixes for allowing the GVA address space to grow to
larger than 32GB and increase the address space to 128GB.

o Implement dynamic allocation of PDE backing pages. The memory to
  store the PDE entries was hard coded to 1 page. Now the number of
  pages necessary is computed dynamically based on the size of the
  address space and the size of large pages.

o Fix an arithmetic problem in the gm20b sparse texture code that
  caused large address spaces to be truncated when sparse PDEs/PTEs
  were being filled in. This caused a kernel panic when freeing the
  address space, since a lot of the backing PTE memory was not
  allocated.

o Change the address space split for large and small pages. Small
  pages now occupy the bottom 16GB of the address space. Large pages
  are used for the rest of the address space. Now, with a 128GB
  address space, there are 112GB of large page GVA available.

This patch exists to allow large (16GB) sparse textures to be
allocated without running into out-of-memory issues and kernel
panics.

Bug 1574267

Change-Id: I7c59ee54bd573dfc53b58c346156df37a85dfc22
Signed-off-by: Alex Waterman
Reviewed-on: http://git-master/r/671204
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 45 +++++++++++++++++++++++++-------------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 11 ++++++----
 drivers/gpu/nvgpu/gm20b/mm_gm20b.c |  2 +-
 3 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 949237b1..09948a25 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1811,6 +1811,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 			goto clean_up;
 		}
 
+		BUG_ON(!pte_kv_cur);
+
 		gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
 		for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
 			if (likely(sgt)) {
@@ -2128,9 +2130,10 @@ static int gk20a_init_vm(struct mm_gk20a *mm,
 		char *name)
 {
 	int err, i;
-	u32 num_pages, low_hole_pages;
+	u32 num_small_pages, num_large_pages, low_hole_pages;
 	char alloc_name[32];
-	u64 vma_size;
+	u64 small_vma_size, large_vma_size;
+	u32 pde_pages;
 
 	/* note: keep the page sizes sorted lowest to highest here */
 	u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size };
@@ -2206,7 +2209,10 @@ static int gk20a_init_vm(struct mm_gk20a *mm,
 			name, vm->va_limit, vm->pdes.num_pdes);
 
 	/* allocate the page table directory */
-	err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
+	pde_pages = ilog2((vm->pdes.num_pdes + 511) / 512);
+
+	gk20a_dbg(gpu_dbg_pte, "Allocating 2 ** %d PDE pages\n", pde_pages);
+	err = alloc_gmmu_pages(vm, pde_pages, &vm->pdes.ref,
 			       &vm->pdes.sgt, &vm->pdes.size);
 	if (err)
 		goto clean_up_pdes;
@@ -2220,13 +2226,15 @@ static int gk20a_init_vm(struct mm_gk20a *mm,
 		  vm->pdes.kv, gk20a_mm_iova_addr(vm->mm->g, vm->pdes.sgt->sgl));
 	/* we could release vm->pdes.kv but it's only one page... */
 
-	/* low-half: alloc small pages */
-	/* high-half: alloc big pages */
-	vma_size = vm->va_limit;
-	if (big_pages)
-		vma_size /= 2;
+	/* First 16GB of the address space goes towards small pages. Whatever
+	 * remains is allocated to large pages. */
+	small_vma_size = vm->va_limit;
+	if (big_pages) {
+		small_vma_size = (u64)16 << 30;
+		large_vma_size = vm->va_limit - small_vma_size;
+	}
 
-	num_pages = (u32)(vma_size >>
+	num_small_pages = (u32)(small_vma_size >>
 		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
 
 	/* num_pages above is without regard to the low-side hole. */
@@ -2238,20 +2246,22 @@ static int gk20a_init_vm(struct mm_gk20a *mm,
 	err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
 			alloc_name,
 			low_hole_pages,		 /*start*/
-			num_pages - low_hole_pages);/* length*/
+			num_small_pages - low_hole_pages);/* length*/
 	if (err)
 		goto clean_up_map_pde;
 
 	if (big_pages) {
-		num_pages = (u32)((vm->va_limit / 2) >>
+		u32 start = (u32)(small_vma_size >>
+			    ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
+		num_large_pages = (u32)(large_vma_size >>
 			    ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
 
 		snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB",
 			 name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10);
 		err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
 				alloc_name,
-				num_pages, /* start */
-				num_pages); /* length */
+				start, /* start */
+				num_large_pages); /* length */
 		if (err)
 			goto clean_up_small_allocator;
 	}
@@ -2269,7 +2279,7 @@ clean_up_small_allocator:
 clean_up_map_pde:
 	unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
 clean_up_ptes:
-	free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
+	free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, pde_pages,
 			vm->pdes.size);
 clean_up_pdes:
 	kfree(vm->pdes.ptes[gmmu_page_size_small]);
@@ -2647,10 +2657,15 @@ int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
 
 static void gk20a_deinit_vm(struct vm_gk20a *vm)
 {
+	u32 pde_pages;
+
 	gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
 	gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
+
 	unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
-	free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
+
+	pde_pages = ilog2((vm->pdes.num_pdes + 511) / 512);
+	free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, pde_pages,
 			vm->pdes.size);
 	kfree(vm->pdes.ptes[gmmu_page_size_small]);
 	kfree(vm->pdes.ptes[gmmu_page_size_big]);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 04f9446b..d39dcff0 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -25,10 +25,13 @@
 #include
 #include "gk20a_allocator.h"
 
-/* For now keep the size relatively small-ish compared to the full
- * 40b va. 32GB for now. It consists of two 16GB spaces. */
-#define NV_GMMU_VA_RANGE	35ULL
-#define NV_GMMU_VA_IS_UPPER(x)	((x) >= ((u64)0x1 << (NV_GMMU_VA_RANGE-1)))
+/*
+ * Amount of the GVA space we actually use is smaller than the available
+ * space. The bottom 16GB of the space are used for small pages, the remaining
+ * high memory is for large pages.
+ */
+#define NV_GMMU_VA_RANGE	37ULL
+#define NV_GMMU_VA_IS_UPPER(x)	((x) >= ((u64)SZ_1G * 16))
 
 #ifdef CONFIG_ARM64
 #define outer_flush_range(a, b)
diff --git a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
index 5b1a9a04..1adff5ab 100644
--- a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
@@ -111,7 +111,7 @@ static int gm20b_vm_put_sparse(struct vm_gk20a *vm, u64 vaddr,
 
 	gk20a_dbg_fn("");
 
-	vaddr_hi = vaddr + pgsz * num_pages - 1;
+	vaddr_hi = vaddr + pgsz * (u64)num_pages - 1;
 	pde_range_from_vaddr_range(vm,
 				   vaddr,
 				   vaddr_hi,
--
cgit v1.2.2
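
A note on the PDE sizing above: the second argument of alloc_gmmu_pages()
is an allocation order (log2 of the page count), as the old hardcoded 0
(one page), the new ilog2() computation, and the "2 ** pde_pages" debug
line all indicate. Below is a minimal standalone sketch of that
arithmetic, not driver code: the 4KB backing page holding 512 8-byte PDE
entries and the PDE span of big_page_size * 1024 bytes are illustrative
assumptions consistent with the commit message, which says the page count
depends on the address space size and the large page size.

/*
 * Standalone sketch (not driver code) of the dynamic PDE backing-page
 * computation this patch introduces. Assumptions: 4KB backing pages with
 * 512 8-byte PDE entries each; each PDE covers big_page_size * 1024 bytes
 * of GVA (illustrative, consistent with the commit message).
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Userspace stand-in for the kernel's ilog2(): floor(log2(v)) */
static unsigned int ilog2_u32(uint32_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	uint64_t va_limit = 128ULL << 30;		/* 128GB GVA */
	uint64_t pde_span = (128ULL << 10) * 1024;	/* 128KB big pages -> 128MB per PDE */
	uint32_t num_pdes = (uint32_t)(va_limit / pde_span);

	/* Whole 4KB pages needed (512 PDEs each), expressed as an order */
	uint32_t pde_pages = ilog2_u32((num_pdes + 511) / 512);

	printf("num_pdes=%" PRIu32 " -> order=%" PRIu32 " (%u backing page(s))\n",
	       num_pdes, pde_pages, 1u << pde_pages);
	return 0;
}

Under these assumptions the 128GB space needs 1024 PDEs, i.e. two backing
pages (order 1), where the old code always allocated exactly one page
(order 0) and overran it for any space larger than 32GB.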
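
The gm20b change is a 32-bit multiplication overflow fix: with pgsz and
num_pages both 32-bit (as the added (u64) cast suggests), pgsz * num_pages
is evaluated in 32-bit arithmetic before being added to the 64-bit vaddr,
so any sparse range of 4GB or more wraps and vaddr_hi comes out too small.
The PDEs/PTEs past the wrap point are then never populated, and freeing
the space trips over the missing backing memory. A standalone
demonstration with illustrative values (types mirror the apparent driver
usage; not driver code):

/*
 * Standalone demonstration (not driver code) of the truncation fixed in
 * gm20b_vm_put_sparse(): 64-bit vaddr, 32-bit page size and page count.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	uint64_t vaddr = 16ULL << 30;	/* base of the large-page VMA (16GB) */
	uint32_t pgsz = 128 << 10;	/* 128KB large pages */
	uint32_t num_pages = 1u << 17;	/* 2^17 pages = a 16GB sparse range */

	/* Old code: the u32 * u32 product wraps (2^34 mod 2^32 == 0) */
	uint64_t vaddr_hi_bad = vaddr + pgsz * num_pages - 1;
	/* Fixed code: widen one operand before multiplying */
	uint64_t vaddr_hi_ok = vaddr + pgsz * (uint64_t)num_pages - 1;

	printf("bad=0x%" PRIx64 " ok=0x%" PRIx64 "\n", vaddr_hi_bad, vaddr_hi_ok);
	return 0;
}

Here the 32-bit product wraps to zero, so the old expression puts vaddr_hi
one byte below vaddr and the sparse fill covers none of the 16GB range,
matching the unallocated backing PTE memory and the kernel panic on free
described in the commit message.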