From a2e852364582e9c337f52bc53ccc33877c8f3b47 Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Wed, 18 Mar 2015 13:33:09 -0700
Subject: gpu: nvgpu: New allocator for VA space

Implement a new buddy allocation scheme for the GPU's VA space. The
bitmap allocator was using too much memory and is not a scalable
solution as the GPU's address space keeps getting bigger. The buddy
allocation scheme is much more memory efficient when the majority of
the address space is not allocated.

The buddy allocator is not constrained by the notion of a split
address space. The bitmap allocator could manage either small pages or
large pages, but not both at the same time; thus the bottom of the
address space was reserved for small pages and the top for large
pages. Although that split is not removed quite yet, the new allocator
makes removing it possible.

The buddy allocator is also very scalable. It manages address spaces
ranging from the relatively small comptag space to the enormous GPU VA
space and everything in between. This is important since the GPU has
many differently sized address spaces that need managing.

Currently there are certain limitations. For one, the allocator does
not handle fixed allocations from CUDA very well; it can do so, but
with certain caveats. The PTE page size is always set to small, which
means the buddy allocator may place other small-page allocations in
the buddies around the fixed allocation. It does this to avoid having
large- and small-page allocations in the same PDE.

Change-Id: I501cd15af03611536490137331d43761c402c7f9
Signed-off-by: Alex Waterman
Reviewed-on: http://git-master/r/740694
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 68 ++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 39 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a/pmu_gk20a.c')

diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 275fbd4e..fc8d130c 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -2438,7 +2438,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	struct pmu_payload payload;
 	u32 seq;
 	u32 data;
-	int err = 0;
 
 	gk20a_dbg_fn("");
 
@@ -2489,12 +2488,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
 
 	if (!pmu->sample_buffer)
-		err = pmu->dmem.alloc(&pmu->dmem,
-				&pmu->sample_buffer, 2 * sizeof(u16),
-				PMU_DMEM_ALLOC_ALIGNMENT);
-	if (err) {
+		pmu->sample_buffer = gk20a_balloc(&pmu->dmem,
+						  2 * sizeof(u16));
+	if (!pmu->sample_buffer) {
 		gk20a_err(dev_from_gk20a(g),
-			"failed to allocate perfmon sample buffer");
+			  "failed to allocate perfmon sample buffer");
 		return -ENOMEM;
 	}
 
@@ -2592,15 +2590,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu,
 	for (i = 0; i < PMU_QUEUE_COUNT; i++)
 		pmu_queue_init(pmu, i, init);
 
-	if (!pmu->dmem.alloc) {
-		/*Align start and end addresses*/
+	if (!pmu->dmem.init) {
+		/* Align start and end addresses */
 		u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				PMU_DMEM_ALLOC_ALIGNMENT);
 		u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) +
-			pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
+				pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
 			~(PMU_DMEM_ALLOC_ALIGNMENT - 1);
 		u32 size = end - start;
-		gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size);
+		__gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem",
+				       start, size,
+				       PMU_DMEM_ALLOC_ALIGNMENT, 4, 0);
 	}
 
 	pmu->pmu_ready = true;
@@ -2737,20 +2737,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu,
 		seq->callback = NULL;
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_in_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-			pv->get_pmu_seq_in_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-			pv->get_pmu_seq_in_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+			pv->get_pmu_seq_in_a_ptr(seq)));
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_out_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-			pv->get_pmu_seq_out_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-			pv->get_pmu_seq_out_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+			pv->get_pmu_seq_out_a_ptr(seq)));
 
 	if (seq->callback)
 		seq->callback(g, msg, seq->cb_params, seq->desc, ret);
@@ -3387,11 +3381,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 			pv->pmu_allocation_set_dmem_size(pmu, in,
 			(u16)max(payload->in.size, payload->out.size));
 
-		err = pmu->dmem.alloc(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
-		if (err)
+		*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) =
+			gk20a_balloc(&pmu->dmem,
+				pv->pmu_allocation_get_dmem_size(pmu, in));
+		if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)))
 			goto clean_up;
 
 		pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
@@ -3412,11 +3405,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 			(u16)payload->out.size);
 
 		if (payload->out.buf != payload->in.buf) {
-			err = pmu->dmem.alloc(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
-			pv->pmu_allocation_get_dmem_size(pmu, out),
-			PMU_DMEM_ALLOC_ALIGNMENT);
-			if (err)
+
+			*(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) =
+				gk20a_balloc(&pmu->dmem,
+					pv->pmu_allocation_get_dmem_size(pmu, out));
+			if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu,
+					out)))
 				goto clean_up;
 		} else {
 			BUG_ON(in == NULL);
@@ -3444,15 +3438,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 clean_up:
 	gk20a_dbg_fn("fail");
 	if (in)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, in));
 	if (out)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, out),
-			pv->pmu_allocation_get_dmem_size(pmu, out),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, out));
 
 	pmu_seq_release(pmu, seq);
 	return err;
--
cgit v1.2.2
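
For readers unfamiliar with the scheme the commit message describes, below
is a minimal, self-contained C sketch of power-of-two buddy allocation. It
is an illustration only: the names (struct buddy, ba_init, ba_alloc,
ba_free, min_size) are hypothetical and are not the nvgpu gk20a_allocator
API, and a production allocator would typically keep per-order free lists
rather than walking a tree as this sketch does. Two properties the patch
relies on are visible here: allocation returns 0 on failure, which is why
the patch can test `if (!pmu->sample_buffer)` (and why the managed range
must start above offset 0), and free needs only the offset because the
allocator itself remembers each block's size, which is why the size and
alignment arguments to pmu->dmem.free() could be dropped in gk20a_bfree().

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical buddy-allocator sketch; not the nvgpu implementation.
 * 0 means allocation failure, so the range given to ba_init() must
 * start above offset 0. */
struct buddy {
	uint64_t start;		/* base offset of this block */
	unsigned order;		/* block size == min_size << order */
	int free;		/* leaf only: is the block unallocated? */
	struct buddy *left;	/* lower half; NULL while unsplit */
	struct buddy *right;	/* upper half; NULL while unsplit */
};

struct buddy_allocator {
	struct buddy root;	/* covers the whole managed range */
	uint64_t min_size;	/* granularity: size of an order-0 block */
};

void ba_init(struct buddy_allocator *a, uint64_t base, unsigned max_order,
	     uint64_t min_size)
{
	a->min_size = min_size;
	a->root = (struct buddy){ .start = base, .order = max_order,
				  .free = 1 };
}

static uint64_t ba_alloc_buddy(struct buddy *b, unsigned order,
			       uint64_t min_size)
{
	uint64_t off;

	if (b->order < order)		/* block too small for the request */
		return 0;
	if (b->left) {			/* already split: try both halves */
		off = ba_alloc_buddy(b->left, order, min_size);
		return off ? off : ba_alloc_buddy(b->right, order, min_size);
	}
	if (!b->free)
		return 0;
	if (b->order == order) {	/* exact fit: claim this block */
		b->free = 0;
		return b->start;
	}
	/* block too big: split into two buddies, descend into the lower */
	b->left = calloc(1, sizeof(*b->left));
	b->right = calloc(1, sizeof(*b->right));
	if (!b->left || !b->right) {
		free(b->left);
		free(b->right);
		b->left = b->right = NULL;
		return 0;
	}
	*b->left = (struct buddy){ .start = b->start,
				   .order = b->order - 1, .free = 1 };
	*b->right = (struct buddy){ .start = b->start +
					(min_size << (b->order - 1)),
				    .order = b->order - 1, .free = 1 };
	return ba_alloc_buddy(b->left, order, min_size);
}

/* Round the request up to a power-of-two order, then walk the tree. */
uint64_t ba_alloc(struct buddy_allocator *a, uint64_t size)
{
	unsigned order = 0;

	while ((a->min_size << order) < size) {
		if (order++ == a->root.order)
			return 0;	/* bigger than the managed range */
	}
	return ba_alloc_buddy(&a->root, order, a->min_size);
}

/* Free takes only the offset: the tree remembers each block's size. */
static int ba_free_buddy(struct buddy *b, uint64_t offset)
{
	if (!b->left) {			/* leaf: offset must match exactly */
		if (b->free || b->start != offset)
			return 0;
		b->free = 1;
		return 1;
	}
	if (!ba_free_buddy(offset < b->right->start ? b->left : b->right,
			   offset))
		return 0;
	/* coalesce: merge the two buddies once both are free leaves */
	if (!b->left->left && b->left->free &&
	    !b->right->left && b->right->free) {
		free(b->left);
		free(b->right);
		b->left = b->right = NULL;
		b->free = 1;
	}
	return 1;
}

int ba_free(struct buddy_allocator *a, uint64_t offset)
{
	return ba_free_buddy(&a->root, offset);
}

As a usage example, after ba_init(&a, 0x1000, 20, 4096) the call
ba_alloc(&a, 4096) splits the range down to an order-0 block and returns
0x1000, and ba_free(&a, 0x1000) marks it free and coalesces the buddies
back up to a single block. Because only whole power-of-two blocks are
ever handed out, the allocator needs memory proportional to the number
of live allocations rather than to the size of the managed range, which
is the memory-efficiency argument the commit message makes against the
bitmap allocator.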