From 613990cb391c74436384d63d12240221565011d5 Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Mon, 31 Aug 2015 14:30:35 +0530
Subject: gpu: nvgpu: implement per-channel watchdog

Implement a per-channel watchdog/timer according to the following
rules:

- start the timer when the first job is submitted on a channel, or
  when no timer is already running
- cancel the timer when a job completes
- restart the timer if any incomplete job is left in the channel's
  queue
- trigger the appropriate recovery method as part of the timeout
  handling

Handle a timeout as follows:

- get the timed-out channel and its job data
- disable activity on all engines
- check whether the fence is really still pending
- get information on the failing engine
- if no engine is failing, just abort the channel
- if an engine is failing, trigger the recovery

Also add the flag "ch_wdt_enabled" to enable/disable the channel
watchdog mechanism. The watchdog can also be disabled through the
global flag "timeouts_enabled".

Set the watchdog timeout to 5 s using the macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS.

Bug 200133289

Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
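Notes:

The start/cancel/restart rules above reduce to the delayed-work
pattern sketched below. This is a minimal, self-contained
illustration, not the driver code itself: the names (wdt_state,
wdt_arm, wdt_disarm, wdt_timeout_fn) are hypothetical, and the sync
cancel is done outside the lock so it cannot deadlock against a
handler that takes the same lock.

#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/jiffies.h>

struct wdt_state {
	struct delayed_work wq;	/* fires when the timeout elapses */
	struct mutex lock;	/* protects running and job */
	bool running;
	void *job;		/* oldest incomplete job being watched */
};

static void wdt_timeout_fn(struct work_struct *work)
{
	struct wdt_state *w = container_of(to_delayed_work(work),
					   struct wdt_state, wq);
	/* recovery decision goes here; see the second sketch */
	(void)w;
}

static void wdt_init(struct wdt_state *w)
{
	mutex_init(&w->lock);
	INIT_DELAYED_WORK(&w->wq, wdt_timeout_fn);
	w->running = false;
}

/* Arm the watchdog for a job, unless one is already running. */
static void wdt_arm(struct wdt_state *w, void *job, unsigned int ms)
{
	mutex_lock(&w->lock);
	if (!w->running) {
		w->job = job;
		w->running = true;
		schedule_delayed_work(&w->wq, msecs_to_jiffies(ms));
	}
	mutex_unlock(&w->lock);
}

/* Disarm the watchdog once the tracked job completes. */
static void wdt_disarm(struct wdt_state *w)
{
	bool was_running;

	mutex_lock(&w->lock);
	was_running = w->running;
	w->running = false;
	mutex_unlock(&w->lock);

	if (was_running)
		cancel_delayed_work_sync(&w->wq);
}

The submit path arms the watchdog per job; the completion path
disarms it and, if an incomplete job is still queued, re-arms it, so
the 5 s window always covers the oldest pending job.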
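The recovery decision made on a timeout has the shape below. This is
a condensed sketch of gk20a_channel_timeout_handler() in the diff
that follows, assuming the driver's own headers: handle_timeout is an
illustrative name, the locking and channel refcounting are trimmed,
and the real handler looks up the failing channel with
gk20a_channel_get() to pass its hw_chid and timeout_debug_dump flag
to gk20a_fifo_recover().

static void handle_timeout(struct gk20a *g, struct channel_gk20a *ch,
			   struct channel_gk20a_job *job)
{
	int id = -1;
	bool is_tsg = false;
	u32 engine_id;

	/* Quiesce the engines so the state we inspect is stable. */
	if (gk20a_fifo_disable_all_engine_activity(g, true))
		return;

	/* The job may have completed just as the timer fired. */
	if (gk20a_fence_is_expired(job->post_fence))
		goto out;

	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
	if (engine_id >= g->fifo.max_engines) {
		/* No engine is stuck: mark the context faulted and
		 * abort only this channel (or its whole TSG). */
		gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
		gk20a_channel_abort(ch);
	} else {
		/* An engine is wedged: run full recovery on it. */
		gk20a_fifo_recover(g, BIT(engine_id), id, is_tsg,
				   true, true);
	}
out:
	gk20a_fifo_enable_all_engine_activity(g);
}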
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 124 +++++++++++++++++++++++++++++++-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |  11 +++
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    |   6 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |   5 ++
 drivers/gpu/nvgpu/gk20a/gk20a.c         |   2 +
 drivers/gpu/nvgpu/gk20a/gk20a.h         |   3 +
 6 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index c18a4e5d..2dc8e9a0 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
+static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
+{
+	if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled)
+		return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS;
+	else
+		return (u32)MAX_SCHEDULE_TIMEOUT;
+}
+
 static u32 get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 	}
 }
 
+static void gk20a_channel_timeout_start(struct channel_gk20a *ch,
+		struct channel_gk20a_job *job)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.job = job;
+	ch->timeout.initialized = true;
+	schedule_delayed_work(&ch->timeout.wq,
+		msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch)));
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (!ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.initialized = false;
+	cancel_delayed_work_sync(&ch->timeout.wq);
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_handler(struct work_struct *work)
+{
+	struct channel_gk20a_job *job;
+	struct gk20a *g;
+	struct channel_gk20a *ch;
+	struct channel_gk20a *failing_ch;
+	u32 engine_id;
+	int id = -1;
+	bool is_tsg = false;
+
+	ch = container_of(to_delayed_work(work), struct channel_gk20a,
+			timeout.wq);
+	ch = gk20a_channel_get(ch);
+	if (!ch)
+		return;
+
+	g = ch->g;
+
+	/* Need global lock since multiple channels can timeout at a time */
+	mutex_lock(&g->ch_wdt_lock);
+
+	/* Get timed out job and reset the timer */
+	mutex_lock(&ch->timeout.lock);
+	job = ch->timeout.job;
+	ch->timeout.initialized = false;
+	mutex_unlock(&ch->timeout.lock);
+
+	if (gk20a_fifo_disable_all_engine_activity(g, true))
+		goto fail_unlock;
+
+	if (gk20a_fence_is_expired(job->post_fence))
+		goto fail_enable_engine_activity;
+
+	gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
+		ch->hw_chid);
+
+	/* Get failing engine data */
+	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
+
+	if (engine_id >= g->fifo.max_engines) {
+		/* If no failing engine, abort the channels */
+		if (gk20a_is_channel_marked_as_tsg(ch)) {
+			struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
+
+			gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+			gk20a_fifo_abort_tsg(g, ch->tsgid);
+		} else {
+			gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+			gk20a_channel_abort(ch);
+		}
+	} else {
+		/* If failing engine, trigger recovery */
+		failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
+		if (!failing_ch)
+			goto fail_enable_engine_activity;
+
+		if (failing_ch->hw_chid != ch->hw_chid)
+			gk20a_channel_timeout_start(ch, job);
+
+		gk20a_fifo_recover(g, BIT(engine_id),
+			failing_ch->hw_chid, is_tsg,
+			true, failing_ch->timeout_debug_dump);
+
+		gk20a_channel_put(failing_ch);
+	}
+
+fail_enable_engine_activity:
+	gk20a_fifo_enable_all_engine_activity(g);
+fail_unlock:
+	mutex_unlock(&g->ch_wdt_lock);
+	gk20a_channel_put(ch);
+}
+
 static int gk20a_channel_add_job(struct channel_gk20a *c,
 				 struct gk20a_fence *pre_fence,
 				 struct gk20a_fence *post_fence)
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 		job->pre_fence = gk20a_fence_get(pre_fence);
 		job->post_fence = gk20a_fence_get(post_fence);
 
+		gk20a_channel_timeout_start(c, job);
+
 		mutex_lock(&c->jobs_lock);
 		list_add_tail(&job->list, &c->jobs);
 		mutex_unlock(&c->jobs_lock);
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 		struct gk20a *g = c->g;
 		bool completed = gk20a_fence_is_expired(job->post_fence);
 
-		if (!completed)
+		if (!completed) {
+			gk20a_channel_timeout_start(c, job);
 			break;
+		}
+
+		gk20a_channel_timeout_stop(c);
 
 		if (c->sync)
 			c->sync->signal_timeline(c->sync);
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	mutex_init(&c->ioctl_lock);
 	mutex_init(&c->jobs_lock);
 	mutex_init(&c->submit_lock);
+	mutex_init(&c->timeout.lock);
+	INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
 	INIT_LIST_HEAD(&c->jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 2ea5b4be..70930291 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -38,6 +38,8 @@ struct gk20a_fence;
 #include "gr_gk20a.h"
 #include "fence_gk20a.h"
 
+#define NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS	5000
+
 struct gpfifo {
 	u32 entry0;
 	u32 entry1;
@@ -70,6 +72,13 @@ struct channel_gk20a_job {
 	struct list_head list;
 };
 
+struct channel_gk20a_timeout {
+	struct delayed_work wq;
+	struct mutex lock;
+	bool initialized;
+	struct channel_gk20a_job *job;
+};
+
 struct channel_gk20a_poll_events {
 	struct mutex lock;
 	bool events_enabled;
@@ -126,6 +135,8 @@ struct channel_gk20a {
 	u32 timeout_accumulated_ms;
 	u32 timeout_gpfifo_get;
 
+	struct channel_gk20a_timeout timeout;
+
 	bool cmds_pending;
 	struct {
 		/* These fences should be accessed with submit_lock held. */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 069ea82a..f736fe8c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -852,7 +852,7 @@ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 	return verbose;
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 		struct channel_gk20a *ch)
 {
 	gk20a_err(dev_from_gk20a(g),
@@ -861,7 +861,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 	return gk20a_fifo_set_ctx_mmu_error(g, ch);
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		struct tsg_gk20a *tsg)
 {
 	bool ret = true;
@@ -883,7 +883,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 	return ret;
 }
 
-static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
 {
 	struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 	struct channel_gk20a *ch;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 929b5c82..3f9fac54 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -183,5 +183,10 @@ u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
 u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
 u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 		int *__id, bool *__is_tsg);
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+		struct tsg_gk20a *tsg);
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid);
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+		struct channel_gk20a *ch);
 
 #endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index c0889571..fb8b8b14 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -667,6 +667,7 @@ static int gk20a_init_support(struct platform_device *dev)
 
 	mutex_init(&g->dbg_sessions_lock);
 	mutex_init(&g->client_lock);
+	mutex_init(&g->ch_wdt_lock);
 
 	g->remove_support = gk20a_remove_support;
 	return 0;
@@ -1449,6 +1450,7 @@ static int gk20a_probe(struct platform_device *dev)
 		CONFIG_GK20A_DEFAULT_TIMEOUT;
 	if (tegra_platform_is_silicon())
 		gk20a->timeouts_enabled = true;
+	gk20a->ch_wdt_enabled = true;
 
 	/* Set up initial power settings. For non-slicon platforms, disable *
 	 * power features and for silicon platforms, read from platform data */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index dd7a7ad4..46940744 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -475,6 +475,9 @@ struct gk20a {
 	u32 gr_idle_timeout_default;
 	u32 timeouts_enabled;
 
+	u32 ch_wdt_enabled;
+	struct mutex ch_wdt_lock;
+
 	bool slcg_enabled;
 	bool blcg_enabled;
 	bool elcg_enabled;
--
cgit v1.2.2