From 7680fd689ecf7d11bf2dfdba41dc2f33cde2bbe7 Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Tue, 16 May 2017 13:47:58 +0300
Subject: gpu: nvgpu: hold power ref for deterministic channels

To support deterministic channels even on platforms where railgating is
supported, have each deterministic-marked channel hold a power reference
for its lifetime, and skip taking per-job power refs in the submit path
for such channels. Previously, railgating blocked deterministic submits
in general: the gk20a_busy()/gk20a_idle() calls in the submit path can
take time, and, more significantly, the GPU may need to be turned on,
which takes a long and nondeterministic amount of time.

As an exception, gk20a_do_idle() can still block deterministic submits
until gk20a_do_unidle() is called; add an rwsem to guard this. VPR
resize needs do_idle, which conflicts with the deterministic channels'
requirement to keep the GPU on; this is now documented in the ioctl
header.

Always set NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING
in the gpu characteristics, now that it is supported regardless of
railgating. The only thing still blocking
NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_FULL is the sync framework.

Make the channel debug dump show which channels are deterministic.

Bug 200291300
Jira NVGPU-70

Change-Id: I47b6f3a8517cd6e4255f6ca2855e3dd912e4f5f3
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/1483038
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/linux/driver_common.c |   1 +
 drivers/gpu/nvgpu/common/linux/module.c        |   9 ++
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c        | 169 ++++++++++++++++++++++---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h        |   3 +
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c           |   6 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h           |   1 +
 drivers/gpu/nvgpu/gk20a/gk20a.c                |  19 ++-
 drivers/gpu/nvgpu/gk20a/gk20a.h                |   6 +
 8 files changed, 185 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c
index bd9a4e77..a00880ed 100644
--- a/drivers/gpu/nvgpu/common/linux/driver_common.c
+++ b/drivers/gpu/nvgpu/common/linux/driver_common.c
@@ -39,6 +39,7 @@ static void nvgpu_init_vars(struct gk20a *g)
 	gk20a_init_gr(g);
 
 	init_rwsem(&g->busy_lock);
+	init_rwsem(&g->deterministic_busy);
 
 	nvgpu_spinlock_init(&g->mc_enable_lock);
 
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index 34a0ded6..cbad3993 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -298,6 +298,12 @@ int __gk20a_do_idle(struct device *dev, bool force_reset)
 	bool is_railgated;
 	int err = 0;
 
+	/*
+	 * Hold back deterministic submits and changes to deterministic
+	 * channels - this must be outside the power busy locks.
+	 */
+	gk20a_channel_deterministic_idle(g);
+
 	/* acquire busy lock to block other busy() calls */
 	down_write(&g->busy_lock);
 
@@ -403,6 +409,7 @@ fail_drop_usage_count:
 fail_timeout:
 	nvgpu_mutex_release(&platform->railgate_lock);
 	up_write(&g->busy_lock);
+	gk20a_channel_deterministic_unidle(g);
 	return -EBUSY;
 }
 
@@ -456,6 +463,8 @@ int __gk20a_do_unidle(struct device *dev)
 	nvgpu_mutex_release(&platform->railgate_lock);
 	up_write(&g->busy_lock);
 
+	gk20a_channel_deterministic_unidle(g);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 49d83069..90202fd7 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -575,8 +575,15 @@ unbind:
 	g->ops.fifo.unbind_channel(ch);
 	g->ops.fifo.free_inst(g, ch);
 
+	/* put back the channel-wide submit ref from init */
+	if (ch->deterministic) {
+		down_read(&g->deterministic_busy);
+		ch->deterministic = false;
+		gk20a_idle(g);
+		up_read(&g->deterministic_busy);
+	}
+
 	ch->vpr = false;
-	ch->deterministic = false;
 	ch->vm = NULL;
 
 	WARN_ON(ch->sync);
 
@@ -1228,22 +1235,42 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
 		c->vpr = true;
 
-	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC)
+	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC) {
+		down_read(&g->deterministic_busy);
+		/*
+		 * Railgating isn't deterministic; instead of disallowing
+		 * railgating globally, take a power refcount for this
+		 * channel's lifetime. The gk20a_idle() pair for this happens
+		 * when the channel gets freed.
+		 *
+		 * Deterministic flag and this busy must be atomic within the
+		 * busy lock.
+		 */
+		err = gk20a_busy(g);
+		if (err) {
+			up_read(&g->deterministic_busy);
+			return err;
+		}
+
 		c->deterministic = true;
+		up_read(&g->deterministic_busy);
+	}
 
 	/* an address space needs to have been bound at this point. */
 	if (!gk20a_channel_as_bound(c)) {
 		nvgpu_err(g,
 			    "not bound to an address space at time of gpfifo"
 			    " allocation.");
-		return -EINVAL;
+		err = -EINVAL;
+		goto clean_up_idle;
 	}
 	ch_vm = c->vm;
 
 	if (c->gpfifo.mem.size) {
 		nvgpu_err(g, "channel %d :"
			   "gpfifo already allocated", c->hw_chid);
-		return -EEXIST;
+		err = -EEXIST;
+		goto clean_up_idle;
 	}
 
 	err = nvgpu_dma_alloc_map_sys(ch_vm,
@@ -1336,6 +1363,13 @@ clean_up_unmap:
 	nvgpu_dma_unmap_free(ch_vm, &c->gpfifo.mem);
 clean_up:
 	memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
+clean_up_idle:
+	if (c->deterministic) {
+		down_read(&g->deterministic_busy);
+		gk20a_idle(g);
+		c->deterministic = false;
+		up_read(&g->deterministic_busy);
+	}
 	nvgpu_err(g, "fail");
 	return err;
 }
 
@@ -2089,7 +2123,13 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
 		channel_gk20a_free_job(c, job);
 		job_finished = 1;
-		gk20a_idle(g);
+
+		/*
+		 * Deterministic channels have a channel-wide power reference;
+		 * for others, there's one per submit.
+		 */
+		if (!c->deterministic)
+			gk20a_idle(g);
 
 		if (!clean_all) {
 			/* Timeout isn't supported here so don't touch it. */
@@ -2457,7 +2497,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 * Job tracking is necessary for any of the following conditions:
 	 *  - pre- or post-fence functionality
 	 *  - channel wdt
-	 *  - GPU rail-gating
+	 *  - GPU rail-gating with non-deterministic channels
 	 *  - buffer refcounting
 	 *
 	 * If none of the conditions are met, then job tracking is not
@@ -2467,7 +2507,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
 			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
 			c->wdt_enabled ||
-			g->can_railgate ||
+			(g->can_railgate && !c->deterministic) ||
 			!skip_buffer_refcounting;
 
 	if (need_job_tracking) {
@@ -2495,7 +2535,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		 *   behavior of the clean-up operation non-deterministic
 		 *   (should not be performed in the submit path)
 		 * - channel wdt
-		 * - GPU rail-gating
+		 * - GPU rail-gating with non-deterministic channels
 		 * - buffer refcounting
 		 *
 		 * If none of the conditions are met, then deferred clean-up
@@ -2505,7 +2545,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		need_deferred_cleanup = !c->deterministic ||
 					need_sync_framework ||
 					c->wdt_enabled ||
-					g->can_railgate ||
+					(g->can_railgate &&
+					 !c->deterministic) ||
 					!skip_buffer_refcounting;
 
 		/*
@@ -2515,12 +2556,20 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	if (c->deterministic && need_deferred_cleanup)
 		return -EINVAL;
 
-	/* released by job cleanup via syncpt or sema interrupt */
-	err = gk20a_busy(g);
-	if (err) {
-		nvgpu_err(g, "failed to host gk20a to submit gpfifo, process %s",
-			current->comm);
-		return err;
+	if (!c->deterministic) {
+		/*
+		 * Get a power ref unless this is a deterministic
+		 * channel that holds them during the channel lifetime.
+		 * This one is released by gk20a_channel_clean_up_jobs,
+		 * via syncpt or sema interrupt, whichever is used.
+		 */
+		err = gk20a_busy(g);
+		if (err) {
+			nvgpu_err(g,
+				"failed to host gk20a to submit gpfifo, process %s",
+				current->comm);
+			return err;
+		}
 	}
 
 	if (!need_deferred_cleanup) {
@@ -2529,6 +2578,11 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		}
 	}
 
+
+	/* Grab access to HW to deal with do_idle */
+	if (c->deterministic)
+		down_read(&g->deterministic_busy);
+
 	trace_gk20a_channel_submit_gpfifo(g->name,
 					  c->hw_chid,
 					  num_entries,
@@ -2601,6 +2655,10 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 
 	g->ops.fifo.userd_gp_put(g, c);
 
+	/* No hw access beyond this point */
+	if (c->deterministic)
+		up_read(&g->deterministic_busy);
+
 	trace_gk20a_channel_submitted_gpfifo(g->name,
 					     c->hw_chid,
 					     num_entries,
@@ -2622,11 +2680,90 @@ clean_up:
 	gk20a_dbg_fn("fail");
 	gk20a_fence_put(pre_fence);
 	gk20a_fence_put(post_fence);
-	if (need_deferred_cleanup)
+	if (c->deterministic)
+		up_read(&g->deterministic_busy);
+	else if (need_deferred_cleanup)
 		gk20a_idle(g);
+
 	return err;
 }
 
+/*
+ * Stop deterministic channel activity for do_idle() when power needs to go off
+ * momentarily but deterministic channels keep power refs for potentially a
+ * long time.
+ *
+ * Takes write access on g->deterministic_busy.
+ *
+ * Must be paired with gk20a_channel_deterministic_unidle().
+ */ +void gk20a_channel_deterministic_idle(struct gk20a *g) +{ + struct fifo_gk20a *f = &g->fifo; + u32 chid; + + /* Grab exclusive access to the hw to block new submits */ + down_write(&g->deterministic_busy); + + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + + if (!gk20a_channel_get(ch)) + continue; + + if (ch->deterministic) { + /* + * Drop the power ref taken when setting deterministic + * flag. deterministic_unidle will put this and the + * channel ref back. + * + * Hold the channel ref: it must not get freed in + * between. A race could otherwise result in lost + * gk20a_busy() via unidle, and in unbalanced + * gk20a_idle() via closing the channel. + */ + gk20a_idle(g); + } else { + /* Not interesting, carry on. */ + gk20a_channel_put(ch); + } + } +} + +/* + * Allow deterministic channel activity again for do_unidle(). + * + * This releases write access on g->deterministic_busy. + */ +void gk20a_channel_deterministic_unidle(struct gk20a *g) +{ + struct fifo_gk20a *f = &g->fifo; + u32 chid; + + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + + if (!gk20a_channel_get(ch)) + continue; + + /* + * Deterministic state changes inside deterministic_busy lock, + * which we took in deterministic_idle. + */ + if (ch->deterministic) { + if (gk20a_busy(g)) + nvgpu_err(g, "cannot busy() again!"); + /* Took this in idle() */ + gk20a_channel_put(ch); + } + + gk20a_channel_put(ch); + } + + /* Release submits, new deterministic channels and frees */ + up_write(&g->deterministic_busy); +} + int gk20a_init_channel_support(struct gk20a *g, u32 chid) { struct channel_gk20a *c = g->fifo.channel+chid; diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 9872e1b2..ca042883 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -328,6 +328,9 @@ int gk20a_disable_channel_tsg(struct gk20a *g, struct channel_gk20a *ch); int gk20a_channel_suspend(struct gk20a *g); int gk20a_channel_resume(struct gk20a *g); +void gk20a_channel_deterministic_idle(struct gk20a *g); +void gk20a_channel_deterministic_unidle(struct gk20a *g); + int nvgpu_channel_worker_init(struct gk20a *g); void nvgpu_channel_worker_deinit(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 5a571dc8..37e19ef8 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -3494,10 +3494,11 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g, syncpointa = inst_mem[ram_fc_syncpointa_w()]; syncpointb = inst_mem[ram_fc_syncpointb_w()]; - gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, + gk20a_debug_output(o, "%d-%s, pid %d, refs %d%s: ", hw_chid, g->name, ch_state->pid, - ch_state->refs); + ch_state->refs, + ch_state->deterministic ? ", deterministic" : ""); gk20a_debug_output(o, "channel status: %s in use %s %s\n", ccsr_channel_enable_v(channel) ? 
"" : "not", gk20a_decode_ccsr_chan_status(status), @@ -3576,6 +3577,7 @@ void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g, ch_state[chid]->pid = ch->pid; ch_state[chid]->refs = atomic_read(&ch->ref_count); + ch_state[chid]->deterministic = ch->deterministic; nvgpu_mem_rd_n(g, &ch->inst_block, 0, &ch_state[chid]->inst_block[0], ram_in_alloc_size_v()); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 228e5130..1566302f 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -216,6 +216,7 @@ static inline const char *gk20a_fifo_interleave_level_name(u32 interleave_level) struct ch_state { int pid; int refs; + bool deterministic; u32 inst_block[0]; }; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 8624d601..13635706 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -456,22 +456,19 @@ int gk20a_init_gpu_characteristics(struct gk20a *g) gpu->flags |= NVGPU_GPU_FLAGS_HAS_SYNCPOINTS; /* - * Railgating needs job tracking which prevents fast submits. They're - * supported otherwise, provided that the user doesn't request anything - * that depends on job tracking. (Here, fast means strictly no + * Fast submits are supported as long as the user doesn't request + * anything that depends on job tracking. (Here, fast means strictly no * metadata, just the gpfifo contents are copied and gp_put updated). */ - if (!g->can_railgate) - gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING; + gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING; /* - * Railgating and sync framework require deferred job cleanup which - * prevents deterministic submits. They're supported otherwise, - * provided that the user doesn't request anything that depends on - * deferred cleanup. + * Sync framework requires deferred job cleanup, wrapping syncs in FDs, + * and other heavy stuff, which prevents deterministic submits. This is + * supported otherwise, provided that the user doesn't request anything + * that depends on deferred cleanup. */ - if (!g->can_railgate - && !gk20a_channel_sync_needs_sync_framework(g)) + if (!gk20a_channel_sync_needs_sync_framework(g)) gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_FULL; gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_USERSPACE_MANAGED_AS; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 1d867912..79118fca 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -1025,6 +1025,12 @@ struct gk20a { u32 log_trace; struct rw_semaphore busy_lock; + /* + * Guards access to hardware when usual gk20a_{busy,idle} are skipped + * for submits and held for channel lifetime but dropped for an ongoing + * gk20a_do_idle(). + */ + struct rw_semaphore deterministic_busy; struct nvgpu_falcon pmu_flcn; struct nvgpu_falcon sec2_flcn; -- cgit v1.2.2