From 07310de8c1d043b0e5efdcf2d38c28c432b1c9ce Mon Sep 17 00:00:00 2001
From: Konsta Holtta <kholtta@nvidia.com>
Date: Tue, 8 May 2018 17:02:25 +0300
Subject: gpu: nvgpu: poll watchdog status actively

Read GP_GET and GET from hardware every time the poll timer expires
instead of only when the watchdog timer expires. Restart the watchdog
timer if the get pointers have changed since the previous read. This way
stuck channels are detected quicker.

Previously it could have taken at most twice the watchdog timeout limit
for a stuck channel to get recovered; with this change, a coarse sliding
window is used. The polling period is still 100 ms.

The difference is illustrated in the following diagram:

time  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0
get   a b b b b b b b b b b b b b b b b b b b b
prev  - n n n n n n n n n A n n n n n n n n n S
next  - A s s s s s s s s s S

"time" represents wall time in polling units; 0 is submit time. For
simplicity, watchdog timeout is ten units. "get" is the GP_GET that
advances a little from a and then gets stuck at b. "prev" is the
previous behaviour, "next" is after this patch. "A" is when the channel
is detected as advanced, and "S" when it's found stuck and recovered;
small "s" is when it's found stuck but the time limit has not yet
expired, and "n" is when the hw state is not read.

Bug 1700277
Bug 1982826

Change-Id: Ie2921920d5396cee652729c6a7162b740d7a1f06
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1710554
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 35 +++++++++++++++++----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')

diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0c199146..a4637b8f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1388,10 +1388,9 @@ void gk20a_channel_timeout_restart_all_channels(struct gk20a *g)
 /**
  * Check if a timed out channel has hung and recover it if it has.
  *
- * Test if this channel has really got stuck at this point (should be called
- * when the watchdog timer has expired) by checking if its gp_get has advanced
- * or not. If no gp_get action happened since when the watchdog was started,
- * force-reset the channel.
+ * Test if this channel has really got stuck at this point by checking if its
+ * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
+ * when the watchdog was started and it's timed out, force-reset the channel.
  *
  * The gpu is implicitly on at this point, because the watchdog can only run on
  * channels that have submitted jobs pending for cleanup.
@@ -1406,22 +1405,27 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 
 	nvgpu_log_fn(g, " ");
 
-	/* Get status and clear the timer */
+	/* Get status but keep timer running */
 	nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
 	gp_get = ch->timeout.gp_get;
 	pb_get = ch->timeout.pb_get;
-	ch->timeout.running = false;
 	nvgpu_raw_spinlock_release(&ch->timeout.lock);
 
 	new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch);
 	new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch);
 
 	if (new_gp_get != gp_get || new_pb_get != pb_get) {
-		/* Channel has advanced, reschedule */
+		/* Channel has advanced, rewind timer */
+		gk20a_channel_timeout_stop(ch);
 		gk20a_channel_timeout_start(ch);
 		return;
 	}
 
+	if (!nvgpu_timeout_peek_expired(&ch->timeout.timer)) {
+		/* Seems stuck but waiting to time out */
+		return;
+	}
+
 	nvgpu_err(g, "Job on channel %d timed out",
 		  ch->chid);
 
@@ -1435,28 +1439,25 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 }
 
 /**
- * Test if the per-channel timeout is expired and handle the timeout in that case.
+ * Test if the per-channel watchdog is on; check the timeout in that case.
  *
  * Each channel has an expiration time based watchdog. The timer is
  * (re)initialized in two situations: when a new job is submitted on an idle
- * channel and when the timeout is checked but progress is detected.
- *
- * Watchdog timeout does not yet necessarily mean a stuck channel so this may
- * or may not cause recovery.
+ * channel and when the timeout is checked but progress is detected. The
+ * watchdog timeout limit is a coarse sliding window.
  *
  * The timeout is stopped (disabled) after the last job in a row finishes
- * making the channel idle.
+ * and marks the channel idle.
  */
 static void gk20a_channel_timeout_check(struct channel_gk20a *ch)
 {
-	bool timed_out;
+	bool running;
 
 	nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
-	timed_out = ch->timeout.running &&
-		nvgpu_timeout_peek_expired(&ch->timeout.timer);
+	running = ch->timeout.running;
 	nvgpu_raw_spinlock_release(&ch->timeout.lock);
 
-	if (timed_out)
+	if (running)
 		gk20a_channel_timeout_handler(ch);
 }
 
-- 
cgit v1.2.2