From db7095ce5180552d1a70fdea779e5987d55cce7b Mon Sep 17 00:00:00 2001
From: Konsta Holtta <kholtta@nvidia.com>
Date: Tue, 1 Dec 2015 11:55:27 +0200
Subject: gpu: nvgpu: bitmap allocator for comptags

Restore comptags to be bitmap-allocated, like they were before we had
the buddy allocator.

The new buddy allocator introduced by
e99aa2485f8992eabe3556f3ebcb57bdc8ad91ff (originally
6ab2e0c49cb79ca68d2f83f1d4610783d2eaa79b) is fine for the big VAs, but
unsuitable for the small compbit store.

This commit partially reverts the combination of the above commit and
a later one, 86fc7ec9a05999bea8de320840b962db3ee11410, which fixed a
bug that is not present when using a bitmap. With a bitmap allocator,
pruning the extra allocation necessary for user-mapped mode is possible,
so that is also restored.

The original generic bitmap allocator is not restored; instead, a
comptag-only allocator is introduced.

Bug 200145635

Change-Id: I87f3a911826a801124cfd21e44857dfab1c3f378
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/837180
(cherry picked from commit 5a504aeb54f3e89e6561932971158a397157b3f2)
Reviewed-on: http://git-master/r/839742
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c  |  29 +++++-
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h  |  12 ++-
 drivers/gpu/nvgpu/gk20a/ltc_gk20a.c |   5 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c  | 170 ++++++++++++++++++++++++------------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h  |   1 -
 drivers/gpu/nvgpu/gm20b/ltc_gm20b.c |   6 +-
 drivers/gpu/nvgpu/vgpu/gr_vgpu.c    |   2 +-
 drivers/gpu/nvgpu/vgpu/ltc_vgpu.c   |   6 +-
 8 files changed, 163 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 55262a8f..f31f7170 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -2878,6 +2878,33 @@ int gk20a_free_obj_ctx(struct channel_gk20a  *c,
 	return 0;
 }
 
+int gk20a_comptag_allocator_init(struct gk20a_comptag_allocator *allocator,
+		unsigned long size)
+{
+	mutex_init(&allocator->lock);
+	/*
+	 * size is the full comptag store size, including the special 0th ctag
+	 * that is never used: bit 0 is ctag 1. Returns 0, or -ENOMEM on failure.
+	 */
+	size--;
+	allocator->bitmap = vzalloc(BITS_TO_LONGS(size) * sizeof(long));
+	if (!allocator->bitmap)
+		return -ENOMEM;
+	allocator->size = size;
+	return 0;
+}
+
+void gk20a_comptag_allocator_destroy(struct gk20a_comptag_allocator *allocator)
+{
+	/*
+	 * Called only at driver exit (gk20a_remove or init unwind): no users
+	 * are active, so no locking. NULL the bitmap so a repeat call is safe.
+	 */
+	allocator->size = 0;
+	vfree(allocator->bitmap);
+	allocator->bitmap = NULL;
+}
+
 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 {
 	struct gk20a *g = gr->g;
@@ -2936,7 +2963,7 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 	kfree(gr->ctx_vars.local_golden_image);
 	gr->ctx_vars.local_golden_image = NULL;
 
-	gk20a_allocator_destroy(&gr->comp_tags);
+	gk20a_comptag_allocator_destroy(&gr->comp_tags);
 }
 
 static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 94d7c811..c7100182 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -281,7 +281,13 @@ struct gr_gk20a {
 
 	u32 max_comptag_mem; /* max memory size (MB) for comptag */
 	struct compbit_store_desc compbit_store;
-	struct gk20a_allocator comp_tags;
+	struct gk20a_comptag_allocator {
+		struct mutex lock;
+		/* this bitmap starts at ctag 1. 0th cannot be taken */
+		unsigned long *bitmap;
+		/* size of bitmap, not max ctags, so one less */
+		unsigned long size;
+	} comp_tags;
 
 	struct gr_zcull_gk20a zcull;
 
@@ -400,6 +406,10 @@ int gk20a_init_gr_support(struct gk20a *g);
 int gk20a_enable_gr_hw(struct gk20a *g);
 int gk20a_gr_reset(struct gk20a *g);
 void gk20a_gr_wait_initialized(struct gk20a *g);
+/* real size here, but first (ctag 0) isn't used */
+int gk20a_comptag_allocator_init(struct gk20a_comptag_allocator *allocator,
+		unsigned long size);
+void gk20a_comptag_allocator_destroy(struct gk20a_comptag_allocator *allocator);
 
 int gk20a_init_gr_channel(struct channel_gk20a *ch_gk20a);
 
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
index c7f9a55d..c6ff07da 100644
--- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -89,8 +89,9 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (err)
 		return err;
 
-	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
-			       1, max_comptag_lines - 1, 1, 10, 0);
+	err = gk20a_comptag_allocator_init(&gr->comp_tags, max_comptag_lines);
+	if (err)
+		return err;
 
 	gr->comptags_per_cacheline = comptags_per_cacheline;
 	gr->slices_per_ltc = slices_per_fbp / g->ltc_count;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 76c33512..e79cc1d1 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -111,7 +111,7 @@ static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm);
 struct gk20a_dmabuf_priv {
 	struct mutex lock;
 
-	struct gk20a_allocator *comptag_allocator;
+	struct gk20a_comptag_allocator *comptag_allocator;
 	struct gk20a_comptags comptags;
 
 	struct dma_buf_attachment *attach;
@@ -126,6 +126,41 @@ struct gk20a_dmabuf_priv {
 
 static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm);
 
+/* Find and reserve len contiguous ctaglines; 0 or -ENOMEM is returned. */
+static int gk20a_comptaglines_alloc(struct gk20a_comptag_allocator *allocator,
+		u32 *offset, u32 len)
+{
+	unsigned long addr;
+	int err = 0;
+
+	mutex_lock(&allocator->lock);
+	addr = bitmap_find_next_zero_area(allocator->bitmap, allocator->size,
+			0, len, 0);
+	if (addr < allocator->size) {
+		/* ctag 0 is reserved, so bit addr is ctagline addr + 1 */
+		*offset = 1 + addr;
+		bitmap_set(allocator->bitmap, addr, len);
+	} else {
+		err = -ENOMEM;
+	}
+	mutex_unlock(&allocator->lock);
+	return err;
+}
+
+static void gk20a_comptaglines_free(struct gk20a_comptag_allocator *allocator,
+		u32 offset, u32 len)
+{
+	/* number zero is reserved; bitmap base is 1 */
+	u32 addr = offset - 1;
+
+	/* refuse bad ranges: clearing them would write outside the bitmap */
+	if (WARN_ON(offset == 0) || WARN_ON((u64)addr + len > allocator->size))
+		return;
+	mutex_lock(&allocator->lock);
+	bitmap_clear(allocator->bitmap, addr, len);
+	mutex_unlock(&allocator->lock);
+}
+
 static void gk20a_mm_delete_priv(void *_priv)
 {
 	struct gk20a_buffer_state *s, *s_tmp;
@@ -135,8 +170,9 @@ static void gk20a_mm_delete_priv(void *_priv)
 
 	if (priv->comptags.lines) {
 		BUG_ON(!priv->comptag_allocator);
-		gk20a_bfree(priv->comptag_allocator,
-			    priv->comptags.real_offset);
+		gk20a_comptaglines_free(priv->comptag_allocator,
+				priv->comptags.offset,
+				priv->comptags.allocated_lines);
 	}
 
 	/* Free buffer states */
@@ -221,19 +257,21 @@ void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
 static int gk20a_alloc_comptags(struct gk20a *g,
 				struct device *dev,
 				struct dma_buf *dmabuf,
-				struct gk20a_allocator *allocator,
+				struct gk20a_comptag_allocator *allocator,
 				u32 lines, bool user_mappable,
 				u64 *ctag_map_win_size,
 				u32 *ctag_map_win_ctagline)
 {
 	struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
-	u32 ctaglines_to_allocate;
-	u32 ctagline_align = 1;
+	u32 ctaglines_allocsize;
+	u32 ctagline_align;
 	u32 offset;
+	u32 alignment_lines;
 	const u32 aggregate_cacheline_sz =
 		g->gr.cacheline_size * g->gr.slices_per_ltc *
 		g->ltc_count;
 	const u32 small_pgsz = 4096;
+	int err;
 
 	if (!priv)
 		return -ENOSYS;
@@ -242,17 +280,19 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 		return -EINVAL;
 
 	if (!user_mappable) {
-		ctaglines_to_allocate = lines;
+		ctaglines_allocsize = lines;
+		ctagline_align = 1;
 	} else {
-		/* Unfortunately, we cannot use allocation alignment
-		 * here, since compbits per cacheline is not always a
-		 * power of two. So, we just have to allocate enough
-		 * extra that we're guaranteed to find a ctagline
-		 * inside the allocation so that: 1) it is the first
-		 * ctagline in a cacheline that starts at a page
-		 * boundary, and 2) we can add enough overallocation
-		 * that the ctaglines of the succeeding allocation
-		 * are on different page than ours
+		/*
+		 * For security, align the allocation on a page, and reserve
+		 * whole pages. Unfortunately, we cannot ask the allocator to
+		 * align here, since compbits per cacheline is not always a
+		 * power of two. So, we just have to allocate enough extra that
+		 * we're guaranteed to find a ctagline inside the allocation so
+		 * that: 1) it is the first ctagline in a cacheline that starts
+		 * at a page boundary, and 2) we can add enough overallocation
+		 * that the ctaglines of the succeeding allocation are on
+		 * different page than ours.
 		 */
 
 		ctagline_align =
@@ -260,7 +300,7 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 			 aggregate_cacheline_sz) *
 			g->gr.comptags_per_cacheline;
 
-		ctaglines_to_allocate =
+		ctaglines_allocsize =
 			/* for alignment */
 			ctagline_align +
 
@@ -272,37 +312,71 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 			DIV_ROUND_UP(aggregate_cacheline_sz, small_pgsz) *
 			g->gr.comptags_per_cacheline;
 
-		if (ctaglines_to_allocate < lines)
+		if (ctaglines_allocsize < lines)
 			return -EINVAL; /* integer overflow */
 	}
 
 	/* store the allocator so we can use it when we free the ctags */
 	priv->comptag_allocator = allocator;
-	offset = gk20a_balloc(allocator, ctaglines_to_allocate);
-	if (!offset)
-		return -ENOMEM;
+	err = gk20a_comptaglines_alloc(allocator, &offset,
+			       ctaglines_allocsize);
+	if (err)
+		return err;
 
-	priv->comptags.lines = lines;
-	priv->comptags.real_offset = offset;
-	priv->comptags.allocated_lines = ctaglines_to_allocate;
+	/*
+	 * offset needs to be at the start of a page/cacheline boundary;
+	 * prune the preceding ctaglines that were allocated for alignment.
+	 */
+	alignment_lines =
+		DIV_ROUND_UP(offset, ctagline_align) * ctagline_align - offset;
+	if (alignment_lines) {
+		gk20a_comptaglines_free(allocator, offset, alignment_lines);
+		offset += alignment_lines;
+		ctaglines_allocsize -= alignment_lines;
+	}
 
+	/*
+	 * check if we can prune the trailing, too; we just need to reserve
+	 * whole pages and ctagcachelines.
+	 */
 	if (user_mappable) {
-		u64 win_size =
+		u32 needed_cachelines =
+			DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline);
+		u32 needed_bytes = round_up(needed_cachelines *
+					    aggregate_cacheline_sz,
+					    small_pgsz);
+		u32 first_unneeded_cacheline =
+			DIV_ROUND_UP(needed_bytes, aggregate_cacheline_sz);
+		u32 needed_ctaglines = first_unneeded_cacheline *
+			g->gr.comptags_per_cacheline;
+		u64 win_size;
+
+		if (needed_ctaglines < ctaglines_allocsize) {
+			gk20a_comptaglines_free(allocator,
+				offset + needed_ctaglines,
+				ctaglines_allocsize - needed_ctaglines);
+			ctaglines_allocsize = needed_ctaglines;
+		}
+
+		*ctag_map_win_ctagline = offset;
+		win_size =
 			DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline) *
 			aggregate_cacheline_sz;
-		win_size = roundup(win_size, small_pgsz);
 
-		offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align;
-		*ctag_map_win_ctagline = offset;
-		*ctag_map_win_size = win_size;
+		*ctag_map_win_size = round_up(win_size, small_pgsz);
 	}
 
-
 	priv->comptags.offset = offset;
+	priv->comptags.lines = lines;
+	priv->comptags.allocated_lines = ctaglines_allocsize;
+	priv->comptags.user_mappable = user_mappable;
 
 	return 0;
 }
 
+
+
+
 static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
 {
 	gk20a_dbg_fn("");
@@ -1412,7 +1486,7 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 			struct vm_gk20a_mapping_batch *batch)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
-	struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
+	struct gk20a_comptag_allocator *ctag_allocator = &g->gr.comp_tags;
 	struct device *d = dev_from_vm(vm);
 	struct mapped_buffer_node *mapped_buffer = NULL;
 	bool inserted = false, va_allocated = false;
@@ -1579,32 +1653,14 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 			gk20a_get_comptags(d, dmabuf, &comptags);
 			clear_ctags = true;
 
-			comptags.user_mappable = user_mappable;
-
-			if (user_mappable) {
-				/* comptags for the buffer will be
-				   cleared later, but we need to make
-				   sure the whole comptags allocation
-				   (which may be bigger) is cleared in
-				   order not to leak compbits */
-
-				const u32 buffer_ctag_end =
-					comptags.offset + comptags.lines;
-				const u32 alloc_ctag_end =
-					comptags.real_offset +
-					comptags.allocated_lines;
-
-				if (comptags.real_offset < comptags.offset)
-					g->ops.ltc.cbc_ctrl(
-						g, gk20a_cbc_op_clear,
-						comptags.real_offset,
-						comptags.offset - 1);
-
-				if (buffer_ctag_end < alloc_ctag_end)
-					g->ops.ltc.cbc_ctrl(
-						g, gk20a_cbc_op_clear,
-						buffer_ctag_end,
-						alloc_ctag_end - 1);
+			if (comptags.lines < comptags.allocated_lines) {
+				/* clear tail-padding comptags */
+				u32 ctagmin = comptags.offset + comptags.lines;
+				u32 ctagmax = comptags.offset +
+					comptags.allocated_lines - 1;
+
+				g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
+						    ctagmin, ctagmax);
 			}
 		}
 	}
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 5ce931c3..b8b0ca49 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -137,7 +137,6 @@ enum gmmu_pgsz_gk20a {
 };
 
 struct gk20a_comptags {
-	u32 real_offset;
 	u32 offset;
 	u32 lines;
 	u32 allocated_lines;
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
index 9c250a7d..5b6bff7f 100644
--- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
@@ -25,7 +25,6 @@
 
 #include "gk20a/ltc_common.c"
 #include "gk20a/gk20a.h"
-#include "gk20a/gk20a_allocator.h"
 
 
 static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
@@ -90,8 +89,9 @@ static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (err)
 		return err;
 
-	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
-			       1, max_comptag_lines - 1, 1, 10, 0);
+	err = gk20a_comptag_allocator_init(&gr->comp_tags, max_comptag_lines);
+	if (err)
+		return err;
 
 	gr->comptags_per_cacheline = comptags_per_cacheline;
 	gr->slices_per_ltc = slices_per_ltc;
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 4a22441c..2b98dc50 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -795,7 +795,7 @@ static void vgpu_remove_gr_support(struct gr_gk20a *gr)
 {
 	gk20a_dbg_fn("");
 
-	gk20a_allocator_destroy(&gr->comp_tags);
+	gk20a_comptag_allocator_destroy(&gr->comp_tags);
 
 	kfree(gr->gpc_tpc_mask);
 	gr->gpc_tpc_mask = NULL;
diff --git a/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c
index 199e880b..76ee5ec9 100644
--- a/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c
@@ -56,8 +56,10 @@ static int vgpu_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (max_comptag_lines < 2)
 		return -ENXIO;
 
-	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
-			       1, max_comptag_lines - 1, 1, 10, 0); /* length*/
+	err = gk20a_comptag_allocator_init(&gr->comp_tags, max_comptag_lines);
+	if (err)
+		return err;
+
 	return 0;
 }
 
-- 
cgit v1.2.2