diff options
author | Konsta Holtta <kholtta@nvidia.com> | 2018-02-21 09:42:37 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2018-03-09 23:09:44 -0500 |
commit | cb6ed949e272f8ad753bf4ab1c0d20c35f31498b (patch) | |
tree | 16d0acad2430e77f9241abe93fae61937e317373 | |
parent | 4f9368522ea18e3734798d2032b21c58dbb93a04 (diff) |
gpu: nvgpu: support per-channel wdt timeouts
Replace the padding in nvgpu_channel_wdt_args with a timeout value in
milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to
signify the existence of this new field. When the new flag is included
in the value of wdt_status, the field is used to set a per-channel
timeout to override the per-GPU default.
Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug
dump when a timed-out channel gets recovered by the watchdog. Printing
the dump to the serial console easily takes several seconds. (Note that
a separate NVGPU_TIMEOUT_FLAG_DISABLE_DUMP flag already exists for the
ctxsw timeout, set via NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.)
The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either
NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to
be set. The old behaviour was that other values were silently ignored.
The usage of the global default debugfs-controlled ch_wdt_timeout_ms is
changed so that its value takes effect only for newly opened channels
instead of in real time. Also, a zero value no longer means that the
watchdog is disabled; there is a separate flag for that after all.
gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no
engines were found. Correct this.
Bug 1982826
Bug 1985845
Jira NVGPU-73
Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1510898
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/common/linux/channel.c | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 19 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 34 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 2 | ||||
-rw-r--r-- | include/uapi/linux/nvgpu.h | 10 |
7 files changed, 50 insertions, 30 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c index 8bfa4cfc..ea294738 100644 --- a/drivers/gpu/nvgpu/common/linux/channel.c +++ b/drivers/gpu/nvgpu/common/linux/channel.c | |||
@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
753 | */ | 753 | */ |
754 | need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || | 754 | need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || |
755 | (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || | 755 | (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || |
756 | c->wdt_enabled || | 756 | c->timeout.enabled || |
757 | (g->can_railgate && !c->deterministic) || | 757 | (g->can_railgate && !c->deterministic) || |
758 | !skip_buffer_refcounting; | 758 | !skip_buffer_refcounting; |
759 | 759 | ||
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
791 | */ | 791 | */ |
792 | need_deferred_cleanup = !c->deterministic || | 792 | need_deferred_cleanup = !c->deterministic || |
793 | need_sync_framework || | 793 | need_sync_framework || |
794 | c->wdt_enabled || | 794 | c->timeout.enabled || |
795 | (g->can_railgate && | 795 | (g->can_railgate && |
796 | !c->deterministic) || | 796 | !c->deterministic) || |
797 | !skip_buffer_refcounting; | 797 | !skip_buffer_refcounting; |
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c index 0acaa61d..01355b78 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c | |||
@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch, | |||
319 | static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, | 319 | static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, |
320 | struct nvgpu_channel_wdt_args *args) | 320 | struct nvgpu_channel_wdt_args *args) |
321 | { | 321 | { |
322 | if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) | 322 | u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT | |
323 | ch->wdt_enabled = false; | 323 | NVGPU_IOCTL_CHANNEL_ENABLE_WDT); |
324 | else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) | 324 | |
325 | ch->wdt_enabled = true; | 325 | if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) |
326 | ch->timeout.enabled = false; | ||
327 | else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) | ||
328 | ch->timeout.enabled = true; | ||
329 | else | ||
330 | return -EINVAL; | ||
331 | |||
332 | if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT) | ||
333 | ch->timeout.limit_ms = args->timeout_ms; | ||
334 | |||
335 | ch->timeout.debug_dump = (args->wdt_status & | ||
336 | NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0; | ||
326 | 337 | ||
327 | return 0; | 338 | return 0; |
328 | } | 339 | } |
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index 18878991..44a10659 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | |||
@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g, | |||
443 | err = -ENOMEM; | 443 | err = -ENOMEM; |
444 | goto end; | 444 | goto end; |
445 | } | 445 | } |
446 | ce_ctx->ch->wdt_enabled = false; | 446 | ce_ctx->ch->timeout.enabled = false; |
447 | 447 | ||
448 | /* bind the channel to the vm */ | 448 | /* bind the channel to the vm */ |
449 | err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); | 449 | err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index f9b9c6e6..5cd7223f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c, | |||
61 | static struct channel_gk20a_job *channel_gk20a_joblist_peek( | 61 | static struct channel_gk20a_job *channel_gk20a_joblist_peek( |
62 | struct channel_gk20a *c); | 62 | struct channel_gk20a *c); |
63 | 63 | ||
64 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch); | ||
65 | |||
66 | /* allocate GPU channel */ | 64 | /* allocate GPU channel */ |
67 | static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) | 65 | static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) |
68 | { | 66 | { |
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g, | |||
696 | /* By default, channel is regular (non-TSG) channel */ | 694 | /* By default, channel is regular (non-TSG) channel */ |
697 | ch->tsgid = NVGPU_INVALID_TSG_ID; | 695 | ch->tsgid = NVGPU_INVALID_TSG_ID; |
698 | 696 | ||
699 | /* reset timeout counter and update timestamp */ | 697 | /* clear ctxsw timeout counter and update timestamp */ |
700 | ch->timeout_accumulated_ms = 0; | 698 | ch->timeout_accumulated_ms = 0; |
701 | ch->timeout_gpfifo_get = 0; | 699 | ch->timeout_gpfifo_get = 0; |
702 | /* set gr host default timeout */ | 700 | /* set gr host default timeout */ |
703 | ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); | 701 | ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); |
704 | ch->timeout_debug_dump = true; | 702 | ch->timeout_debug_dump = true; |
705 | ch->has_timedout = false; | 703 | ch->has_timedout = false; |
706 | ch->wdt_enabled = true; | 704 | |
705 | /* init kernel watchdog timeout */ | ||
706 | ch->timeout.enabled = true; | ||
707 | ch->timeout.limit_ms = g->ch_wdt_timeout_ms; | ||
708 | ch->timeout.debug_dump = true; | ||
709 | |||
707 | ch->obj_class = 0; | 710 | ch->obj_class = 0; |
708 | ch->subctx_id = 0; | 711 | ch->subctx_id = 0; |
709 | ch->runqueue_sel = 0; | 712 | ch->runqueue_sel = 0; |
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, | |||
1166 | } | 1169 | } |
1167 | } | 1170 | } |
1168 | 1171 | ||
1169 | if (!c->g->timeouts_enabled || !c->wdt_enabled) | 1172 | if (!c->g->timeouts_enabled || !c->timeout.enabled) |
1170 | acquire_timeout = 0; | 1173 | acquire_timeout = 0; |
1171 | else | 1174 | else |
1172 | acquire_timeout = gk20a_get_channel_watchdog_timeout(c); | 1175 | acquire_timeout = c->timeout.limit_ms; |
1173 | 1176 | ||
1174 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, | 1177 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, |
1175 | c->gpfifo.entry_num, | 1178 | c->gpfifo.entry_num, |
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | |||
1265 | ch->timeout_accumulated_ms > ch->timeout_ms_max; | 1268 | ch->timeout_accumulated_ms > ch->timeout_ms_max; |
1266 | } | 1269 | } |
1267 | 1270 | ||
1268 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) | ||
1269 | { | ||
1270 | return ch->g->ch_wdt_timeout_ms; | ||
1271 | } | ||
1272 | |||
1273 | u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) | 1271 | u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) |
1274 | { | 1272 | { |
1275 | update_gp_get(c->g, c); | 1273 | update_gp_get(c->g, c); |
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) | |||
1282 | ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); | 1280 | ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); |
1283 | ch->timeout.running = true; | 1281 | ch->timeout.running = true; |
1284 | nvgpu_timeout_init(ch->g, &ch->timeout.timer, | 1282 | nvgpu_timeout_init(ch->g, &ch->timeout.timer, |
1285 | gk20a_get_channel_watchdog_timeout(ch), | 1283 | ch->timeout.limit_ms, |
1286 | NVGPU_TIMER_CPU_TIMER); | 1284 | NVGPU_TIMER_CPU_TIMER); |
1287 | } | 1285 | } |
1288 | 1286 | ||
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) | |||
1303 | */ | 1301 | */ |
1304 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch) | 1302 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch) |
1305 | { | 1303 | { |
1306 | if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch)) | 1304 | if (!ch->g->timeouts_enabled) |
1307 | return; | 1305 | return; |
1308 | 1306 | ||
1309 | if (!ch->wdt_enabled) | 1307 | if (!ch->timeout.enabled) |
1310 | return; | 1308 | return; |
1311 | 1309 | ||
1312 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); | 1310 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); |
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch) | |||
1425 | nvgpu_err(g, "Job on channel %d timed out", | 1423 | nvgpu_err(g, "Job on channel %d timed out", |
1426 | ch->chid); | 1424 | ch->chid); |
1427 | 1425 | ||
1428 | gk20a_debug_dump(g); | 1426 | /* force reset calls gk20a_debug_dump but not this */ |
1429 | gk20a_gr_debug_dump(g); | 1427 | if (ch->timeout.debug_dump) |
1428 | gk20a_gr_debug_dump(g); | ||
1430 | 1429 | ||
1431 | g->ops.fifo.force_reset_ch(ch, | 1430 | g->ops.fifo.force_reset_ch(ch, |
1432 | NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true); | 1431 | NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, |
1432 | ch->timeout.debug_dump); | ||
1433 | } | 1433 | } |
1434 | 1434 | ||
1435 | /** | 1435 | /** |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index edb645b5..947b8913 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -96,11 +96,17 @@ struct channel_gk20a_joblist { | |||
96 | }; | 96 | }; |
97 | 97 | ||
98 | struct channel_gk20a_timeout { | 98 | struct channel_gk20a_timeout { |
99 | /* lock protects the running timer state */ | ||
99 | struct nvgpu_raw_spinlock lock; | 100 | struct nvgpu_raw_spinlock lock; |
100 | struct nvgpu_timeout timer; | 101 | struct nvgpu_timeout timer; |
101 | bool running; | 102 | bool running; |
102 | u32 gp_get; | 103 | u32 gp_get; |
103 | u64 pb_get; | 104 | u64 pb_get; |
105 | |||
106 | /* lock not needed */ | ||
107 | u32 limit_ms; | ||
108 | bool enabled; | ||
109 | bool debug_dump; | ||
104 | }; | 110 | }; |
105 | 111 | ||
106 | /* | 112 | /* |
@@ -167,7 +173,6 @@ struct channel_gk20a { | |||
167 | struct nvgpu_semaphore_int *hw_sema; | 173 | struct nvgpu_semaphore_int *hw_sema; |
168 | 174 | ||
169 | int chid; | 175 | int chid; |
170 | bool wdt_enabled; | ||
171 | nvgpu_atomic_t bound; | 176 | nvgpu_atomic_t bound; |
172 | bool vpr; | 177 | bool vpr; |
173 | bool deterministic; | 178 | bool deterministic; |
@@ -203,7 +208,9 @@ struct channel_gk20a { | |||
203 | u32 timeout_accumulated_ms; | 208 | u32 timeout_accumulated_ms; |
204 | u32 timeout_gpfifo_get; | 209 | u32 timeout_gpfifo_get; |
205 | 210 | ||
211 | /* kernel watchdog to kill stuck jobs */ | ||
206 | struct channel_gk20a_timeout timeout; | 212 | struct channel_gk20a_timeout timeout; |
213 | |||
207 | /* for job cleanup handling in the background worker */ | 214 | /* for job cleanup handling in the background worker */ |
208 | struct nvgpu_list_node worker_item; | 215 | struct nvgpu_list_node worker_item; |
209 | 216 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 258006f9..96317520 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) | |||
1808 | else { | 1808 | else { |
1809 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; | 1809 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; |
1810 | 1810 | ||
1811 | if (gk20a_fifo_error_tsg(g, tsg)) | 1811 | if (gk20a_fifo_error_tsg(g, tsg) && verbose) |
1812 | gk20a_debug_dump(g); | 1812 | gk20a_debug_dump(g); |
1813 | 1813 | ||
1814 | gk20a_fifo_abort_tsg(g, tsgid, false); | 1814 | gk20a_fifo_abort_tsg(g, tsgid, false); |
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index cf75595a..8a578102 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h | |||
@@ -1577,13 +1577,15 @@ struct nvgpu_cycle_stats_snapshot_args { | |||
1577 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1 | 1577 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1 |
1578 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2 | 1578 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2 |
1579 | 1579 | ||
1580 | /* disable watchdog per-channel */ | 1580 | /* configure watchdog per-channel */ |
1581 | struct nvgpu_channel_wdt_args { | 1581 | struct nvgpu_channel_wdt_args { |
1582 | __u32 wdt_status; | 1582 | __u32 wdt_status; |
1583 | __u32 padding; | 1583 | __u32 timeout_ms; |
1584 | }; | 1584 | }; |
1585 | #define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1 | 1585 | #define NVGPU_IOCTL_CHANNEL_DISABLE_WDT (1 << 0) |
1586 | #define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2 | 1586 | #define NVGPU_IOCTL_CHANNEL_ENABLE_WDT (1 << 1) |
1587 | #define NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT (1 << 2) | ||
1588 | #define NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP (1 << 3) | ||
1587 | 1589 | ||
1588 | /* | 1590 | /* |
1589 | * Interleaving channels in a runlist is an approach to improve | 1591 | * Interleaving channels in a runlist is an approach to improve |