From 246dcb824bc299b836be92fb74416c07a57e085b Mon Sep 17 00:00:00 2001
From: Sachit Kadle <skadle@nvidia.com>
Date: Mon, 19 Sep 2016 14:18:11 -0700
Subject: gpu: nvgpu: improve sync create/destroy logic

This change improves the sync object creation and
aggressive destruction logic to avoid lock contention
in the submit path. It does the following:

1) Replaces the global aggressive sync destruction
threshold (a channel count) with a per-platform
parameter, aggressive_sync_destroy_thresh. A value
of 0 disables aggressive sync destruction.

2) Avoids lock contention in the clean-up/submit
path by not taking the channel's sync_lock when
aggressive sync destruction is disabled.

3) Creates the sync object at gpfifo allocation
time (when the platform's aggressive sync destroy
threshold is 0, i.e. aggressive sync destruction is
disabled), to enable faster first submits.

Bug 1795076

Change-Id: Ifdb680100b08d00f37338063355bb2123ceb1b9f
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1202425
(cherry picked from commit ac0978711943a59c6f28c98c76b10759e0bff610)
Reviewed-on: http://git-master/r/1202427
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c        | 77 ++++++++++++++++++--------
 drivers/gpu/nvgpu/gk20a/platform_gk20a.h       |  3 +
 drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c |  2 +
 drivers/gpu/nvgpu/gk20a/platform_vgpu_tegra.c  |  1 +
 4 files changed, 59 insertions(+), 24 deletions(-)

(limited to 'drivers')

diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 7df794bf..79d449e6 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -43,8 +43,6 @@
 
 #define NVMAP_HANDLE_PARAM_SIZE 1
 
-#define NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT	64	/* channels */
-
 #define NVGPU_CHANNEL_MIN_TIMESLICE_US 1000
 #define NVGPU_CHANNEL_MAX_TIMESLICE_US 50000
 
@@ -91,7 +89,9 @@ static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 	}
 	mutex_unlock(&f->free_chs_mutex);
 
-	if (f->used_channels > NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT)
+	if (platform->aggressive_sync_destroy_thresh &&
+			(f->used_channels >
+			 platform->aggressive_sync_destroy_thresh))
 		platform->aggressive_sync_destroy = true;
 
 	return ch;
@@ -110,7 +110,9 @@ static void free_channel(struct fifo_gk20a *f,
 	f->used_channels--;
 	mutex_unlock(&f->free_chs_mutex);
 
-	if (f->used_channels < NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT)
+	if (platform->aggressive_sync_destroy_thresh &&
+			(f->used_channels <
+			 platform->aggressive_sync_destroy_thresh))
 		platform->aggressive_sync_destroy = false;
 }
 
@@ -1424,6 +1426,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
+	struct gk20a_platform *platform = gk20a_get_platform(d);
 	struct vm_gk20a *ch_vm;
 	u32 gpfifo_size;
 	int err = 0;
@@ -1487,26 +1490,46 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 
 	channel_gk20a_setup_userd(c);
 
+	if (!platform->aggressive_sync_destroy_thresh) {
+		mutex_lock(&c->sync_lock);
+		c->sync = gk20a_channel_sync_create(c);
+		if (!c->sync) {
+			err = -ENOMEM;
+			mutex_unlock(&c->sync_lock);
+			goto clean_up_unmap;
+		}
+		mutex_unlock(&c->sync_lock);
+
+		if (g->ops.fifo.resetup_ramfc) {
+			err = g->ops.fifo.resetup_ramfc(c);
+			if (err)
+				goto clean_up_sync;
+		}
+	}
+
 	err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
 					c->gpfifo.entry_num, args->flags);
 	if (err)
-		goto clean_up_unmap;
+		goto clean_up_sync;
 
 	/* TBD: setup engine contexts */
 
 	err = channel_gk20a_alloc_priv_cmdbuf(c);
 	if (err)
-		goto clean_up_unmap;
+		goto clean_up_sync;
 
 	err = channel_gk20a_update_runlist(c, true);
 	if (err)
-		goto clean_up_unmap;
+		goto clean_up_sync;
 
 	g->ops.fifo.bind_channel(c);
 
 	gk20a_dbg_fn("done");
 	return 0;
 
+clean_up_sync:
+	gk20a_channel_sync_destroy(c->sync);
+	c->sync = NULL;
 clean_up_unmap:
 	nvgpu_free(c->gpfifo.pipe);
 	gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem);
@@ -1911,18 +1934,21 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
 
 		gk20a_channel_timeout_stop(c);
 
-		mutex_lock(&c->sync_lock);
+		WARN_ON(!c->sync);
+
 		if (c->sync) {
 			c->sync->signal_timeline(c->sync);
-			if (atomic_dec_and_test(&c->sync->refcount) &&
-					platform->aggressive_sync_destroy) {
-				gk20a_channel_sync_destroy(c->sync);
-				c->sync = NULL;
+
+			if (platform->aggressive_sync_destroy_thresh) {
+				mutex_lock(&c->sync_lock);
+				if (atomic_dec_and_test(&c->sync->refcount) &&
+						platform->aggressive_sync_destroy) {
+					gk20a_channel_sync_destroy(c->sync);
+					c->sync = NULL;
+				}
+				mutex_unlock(&c->sync_lock);
 			}
-		} else {
-			WARN_ON(1);
 		}
-		mutex_unlock(&c->sync_lock);
 
 		if (job->num_mapped_buffers)
 			gk20a_vm_put_buffers(vm, job->mapped_buffers,
@@ -2099,6 +2125,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 				      u32 flags)
 {
 	struct gk20a *g = c->g;
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 	bool need_sync_fence = false;
 	bool new_sync_created = false;
 	int wait_fence_fd = -1;
@@ -2112,18 +2139,20 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 	if (force_need_sync_fence)
 		need_sync_fence = true;
 
-	mutex_lock(&c->sync_lock);
-	if (!c->sync) {
-		c->sync = gk20a_channel_sync_create(c);
+	if (platform->aggressive_sync_destroy_thresh) {
+		mutex_lock(&c->sync_lock);
 		if (!c->sync) {
-			err = -ENOMEM;
-			mutex_unlock(&c->sync_lock);
-			goto fail;
+			c->sync = gk20a_channel_sync_create(c);
+			if (!c->sync) {
+				err = -ENOMEM;
+				mutex_unlock(&c->sync_lock);
+				goto fail;
+			}
+			new_sync_created = true;
 		}
-		new_sync_created = true;
+		atomic_inc(&c->sync->refcount);
+		mutex_unlock(&c->sync_lock);
 	}
-	atomic_inc(&c->sync->refcount);
-	mutex_unlock(&c->sync_lock);
 
 	if (g->ops.fifo.resetup_ramfc && new_sync_created) {
 		err = g->ops.fifo.resetup_ramfc(c);
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
index 93158cc7..f038b072 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -50,6 +50,9 @@ struct gk20a_platform {
 	/* Should be populated at probe. */
 	bool has_syncpoints;
 
+	/* channel count above which aggressive sync destroy starts; 0 disables it */
+	int aggressive_sync_destroy_thresh;
+
 	/* flag to set sync destroy aggressiveness */
 	bool aggressive_sync_destroy;
 
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index 819c50a4..90ba54ea 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -847,6 +847,7 @@ static int gk20a_clk_get_freqs(struct device *dev,
 
 struct gk20a_platform gk20a_tegra_platform = {
 	.has_syncpoints = true,
+	.aggressive_sync_destroy_thresh = 64,
 
 	/* power management configuration */
 	.railgate_delay		= 500,
@@ -909,6 +910,7 @@ struct gk20a_platform gk20a_tegra_platform = {
 
 struct gk20a_platform gm20b_tegra_platform = {
 	.has_syncpoints = true,
+	.aggressive_sync_destroy_thresh = 64,
 
 	/* power management configuration */
 	.railgate_delay		= 500,
diff --git a/drivers/gpu/nvgpu/gk20a/platform_vgpu_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_vgpu_tegra.c
index b260b3ac..dc898226 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_vgpu_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_vgpu_tegra.c
@@ -49,6 +49,7 @@ static int gk20a_tegra_probe(struct device *dev)
 
 struct gk20a_platform vgpu_tegra_platform = {
 	.has_syncpoints = true,
+	.aggressive_sync_destroy_thresh = 64,
 
 	/* power management configuration */
 	.can_railgate		= false,
-- 
cgit v1.2.2