From cb6ed949e272f8ad753bf4ab1c0d20c35f31498b Mon Sep 17 00:00:00 2001
From: Konsta Holtta <kholtta@nvidia.com>
Date: Wed, 21 Feb 2018 16:42:37 +0200
Subject: gpu: nvgpu: support per-channel wdt timeouts

Replace the padding in nvgpu_channel_wdt_args with a timeout value in
milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to
signify the existence of this new field. When the new flag is included
in the value of wdt_status, the field is used to set a per-channel
timeout to override the per-GPU default.

Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug
dump when a timed-out channel gets recovered by the watchdog. Printing
the dump to a serial console can easily take several seconds. (Note that
a separate flag, NVGPU_TIMEOUT_FLAG_DISABLE_DUMP, already covers the
ctxsw timeout set through NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.)

The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either
NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to
be set; any other value is now rejected with -EINVAL. Previously, other
values were silently ignored.

The global default ch_wdt_timeout_ms, controlled through debugfs, now
takes effect only for newly opened channels instead of in real time.
Also, a zero value no longer means that the watchdog is disabled; there
is a separate flag for that, after all.

gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no
engines were found. Correct this.

Bug 1982826
Bug 1985845
Jira NVGPU-73

Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1510898
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/linux/channel.c       |  4 +--
 drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 19 +++++++++++---
 drivers/gpu/nvgpu/gk20a/ce2_gk20a.c            |  2 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c        | 34 +++++++++++++-------------
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h        |  9 ++++++-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c           |  2 +-
 6 files changed, 44 insertions(+), 26 deletions(-)

(limited to 'drivers/gpu/nvgpu')

diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c
index 8bfa4cfc..ea294738 100644
--- a/drivers/gpu/nvgpu/common/linux/channel.c
+++ b/drivers/gpu/nvgpu/common/linux/channel.c
@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 */
 	need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
 			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
-			c->wdt_enabled ||
+			c->timeout.enabled ||
 			(g->can_railgate && !c->deterministic) ||
 			!skip_buffer_refcounting;
 
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		 */
 		need_deferred_cleanup = !c->deterministic ||
 					need_sync_framework ||
-					c->wdt_enabled ||
+					c->timeout.enabled ||
 					(g->can_railgate &&
 					 !c->deterministic) ||
 					!skip_buffer_refcounting;
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
index 0acaa61d..01355b78 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch,
 static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 		struct nvgpu_channel_wdt_args *args)
 {
-	if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
-		ch->wdt_enabled = false;
-	else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
-		ch->wdt_enabled = true;
+	u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
+			NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
+
+	if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
+		ch->timeout.enabled = false;
+	else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
+		ch->timeout.enabled = true;
+	else
+		return -EINVAL;
+
+	if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT)
+		ch->timeout.limit_ms = args->timeout_ms;
+
+	ch->timeout.debug_dump = (args->wdt_status &
+			NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
 
 	return 0;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 18878991..44a10659 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g,
 		err = -ENOMEM;
 		goto end;
 	}
-	ce_ctx->ch->wdt_enabled = false;
+	ce_ctx->ch->timeout.enabled = false;
 
 	/* bind the channel to the vm */
 	err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index f9b9c6e6..5cd7223f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
 static struct channel_gk20a_job *channel_gk20a_joblist_peek(
 		struct channel_gk20a *c);
 
-static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
-
 /* allocate GPU channel */
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 {
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	/* By default, channel is regular (non-TSG) channel */
 	ch->tsgid = NVGPU_INVALID_TSG_ID;
 
-	/* reset timeout counter and update timestamp */
+	/* clear ctxsw timeout counter and update timestamp */
 	ch->timeout_accumulated_ms = 0;
 	ch->timeout_gpfifo_get = 0;
 	/* set gr host default timeout */
 	ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
 	ch->timeout_debug_dump = true;
 	ch->has_timedout = false;
-	ch->wdt_enabled = true;
+
+	/* init kernel watchdog timeout */
+	ch->timeout.enabled = true;
+	ch->timeout.limit_ms = g->ch_wdt_timeout_ms;
+	ch->timeout.debug_dump = true;
+
 	ch->obj_class = 0;
 	ch->subctx_id = 0;
 	ch->runqueue_sel = 0;
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 		}
 	}
 
-	if (!c->g->timeouts_enabled || !c->wdt_enabled)
+	if (!c->g->timeouts_enabled || !c->timeout.enabled)
 		acquire_timeout = 0;
 	else
-		acquire_timeout = gk20a_get_channel_watchdog_timeout(c);
+		acquire_timeout = c->timeout.limit_ms;
 
 	err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
 					c->gpfifo.entry_num,
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
-static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
-{
-	return ch->g->ch_wdt_timeout_ms;
-}
-
 u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
 	ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch);
 	ch->timeout.running = true;
 	nvgpu_timeout_init(ch->g, &ch->timeout.timer,
-			gk20a_get_channel_watchdog_timeout(ch),
+			ch->timeout.limit_ms,
 			NVGPU_TIMER_CPU_TIMER);
 }
 
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
  */
 static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
 {
-	if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch))
+	if (!ch->g->timeouts_enabled)
 		return;
 
-	if (!ch->wdt_enabled)
+	if (!ch->timeout.enabled)
 		return;
 
 	nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 	nvgpu_err(g, "Job on channel %d timed out",
 		  ch->chid);
 
-	gk20a_debug_dump(g);
-	gk20a_gr_debug_dump(g);
+	/* force reset calls gk20a_debug_dump but not this */
+	if (ch->timeout.debug_dump)
+		gk20a_gr_debug_dump(g);
 
 	g->ops.fifo.force_reset_ch(ch,
-		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true);
+		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
+		ch->timeout.debug_dump);
 }
 
 /**
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index edb645b5..947b8913 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -96,11 +96,17 @@ struct channel_gk20a_joblist {
 };
 
 struct channel_gk20a_timeout {
+	/* lock protects the running timer state */
 	struct nvgpu_raw_spinlock lock;
 	struct nvgpu_timeout timer;
 	bool running;
 	u32 gp_get;
 	u64 pb_get;
+
+	/* lock not needed */
+	u32 limit_ms;
+	bool enabled;
+	bool debug_dump;
 };
 
 /*
@@ -167,7 +173,6 @@ struct channel_gk20a {
 	struct nvgpu_semaphore_int *hw_sema;
 
 	int chid;
-	bool wdt_enabled;
 	nvgpu_atomic_t bound;
 	bool vpr;
 	bool deterministic;
@@ -203,7 +208,9 @@ struct channel_gk20a {
 	u32 timeout_accumulated_ms;
 	u32 timeout_gpfifo_get;
 
+	/* kernel watchdog to kill stuck jobs */
 	struct channel_gk20a_timeout timeout;
+
 	/* for job cleanup handling in the background worker */
 	struct nvgpu_list_node worker_item;
 
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 258006f9..96317520 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 	else {
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 
-		if (gk20a_fifo_error_tsg(g, tsg))
+		if (gk20a_fifo_error_tsg(g, tsg) && verbose)
 			gk20a_debug_dump(g);
 
 		gk20a_fifo_abort_tsg(g, tsgid, false);
-- 
cgit v1.2.2