summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonsta Holtta <kholtta@nvidia.com>2018-05-08 10:02:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2018-05-11 12:53:35 -0400
commit07310de8c1d043b0e5efdcf2d38c28c432b1c9ce (patch)
tree46c3193d7b16f901ebda079c8ada4f4e704aa9ab
parenta7288b58676f14a847592b6d6dcbe9080dfb9edb (diff)
gpu: nvgpu: poll watchdog status actively
Read GP_GET and PB_GET from hardware every time when the poll timer expires instead of when the watchdog timer expires. Restart the watchdog timer if the get pointers have increased since the previous read. This way stuck channels are detected quicker.

Previously it could have taken at most twice the watchdog timeout limit for a stuck channel to get recovered; with this change, a coarse sliding window is used. The polling period is still 100 ms.

The difference is illustrated in the following diagram:

time 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0
get  a b b b b b b b b b b b b b b b b b b b b
prev - n n n n n n n n n A n n n n n n n n n S
next - A s s s s s s s s s S

"time" represents wall time in polling units; 0 is submit time. For simplicity, watchdog timeout is ten units. "get" is the GP_GET that advances a little from a and then gets stuck at b. "prev" is the previous behaviour, "next" is after this patch. "A" is when the channel is detected as advanced, and "S" when it's found stuck and recovered; small "s" is when it's found stuck but when the time limit has not yet expired and "n" is when the hw state is not read.

Bug 1700277
Bug 1982826

Change-Id: Ie2921920d5396cee652729c6a7162b740d7a1f06
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1710554
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c35
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0c199146..a4637b8f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1388,10 +1388,9 @@ void gk20a_channel_timeout_restart_all_channels(struct gk20a *g)
1388/** 1388/**
1389 * Check if a timed out channel has hung and recover it if it has. 1389 * Check if a timed out channel has hung and recover it if it has.
1390 * 1390 *
1391 * Test if this channel has really got stuck at this point (should be called 1391 * Test if this channel has really got stuck at this point by checking if its
1392 * when the watchdog timer has expired) by checking if its gp_get has advanced 1392 * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
1393 * or not. If no gp_get action happened since when the watchdog was started, 1393 * when the watchdog was started and it's timed out, force-reset the channel.
1394 * force-reset the channel.
1395 * 1394 *
1396 * The gpu is implicitly on at this point, because the watchdog can only run on 1395 * The gpu is implicitly on at this point, because the watchdog can only run on
1397 * channels that have submitted jobs pending for cleanup. 1396 * channels that have submitted jobs pending for cleanup.
@@ -1406,22 +1405,27 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1406 1405
1407 nvgpu_log_fn(g, " "); 1406 nvgpu_log_fn(g, " ");
1408 1407
1409 /* Get status and clear the timer */ 1408 /* Get status but keep timer running */
1410 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1409 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1411 gp_get = ch->timeout.gp_get; 1410 gp_get = ch->timeout.gp_get;
1412 pb_get = ch->timeout.pb_get; 1411 pb_get = ch->timeout.pb_get;
1413 ch->timeout.running = false;
1414 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1412 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1415 1413
1416 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch); 1414 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch);
1417 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch); 1415 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch);
1418 1416
1419 if (new_gp_get != gp_get || new_pb_get != pb_get) { 1417 if (new_gp_get != gp_get || new_pb_get != pb_get) {
1420 /* Channel has advanced, reschedule */ 1418 /* Channel has advanced, rewind timer */
1419 gk20a_channel_timeout_stop(ch);
1421 gk20a_channel_timeout_start(ch); 1420 gk20a_channel_timeout_start(ch);
1422 return; 1421 return;
1423 } 1422 }
1424 1423
1424 if (!nvgpu_timeout_peek_expired(&ch->timeout.timer)) {
1425 /* Seems stuck but waiting to time out */
1426 return;
1427 }
1428
1425 nvgpu_err(g, "Job on channel %d timed out", 1429 nvgpu_err(g, "Job on channel %d timed out",
1426 ch->chid); 1430 ch->chid);
1427 1431
@@ -1435,28 +1439,25 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1435} 1439}
1436 1440
1437/** 1441/**
1438 * Test if the per-channel timeout is expired and handle the timeout in that case. 1442 * Test if the per-channel watchdog is on; check the timeout in that case.
1439 * 1443 *
1440 * Each channel has an expiration time based watchdog. The timer is 1444 * Each channel has an expiration time based watchdog. The timer is
1441 * (re)initialized in two situations: when a new job is submitted on an idle 1445 * (re)initialized in two situations: when a new job is submitted on an idle
1442 * channel and when the timeout is checked but progress is detected. 1446 * channel and when the timeout is checked but progress is detected. The
1443 * 1447 * watchdog timeout limit is a coarse sliding window.
1444 * Watchdog timeout does not yet necessarily mean a stuck channel so this may
1445 * or may not cause recovery.
1446 * 1448 *
1447 * The timeout is stopped (disabled) after the last job in a row finishes 1449 * The timeout is stopped (disabled) after the last job in a row finishes
1448 * making the channel idle. 1450 * and marks the channel idle.
1449 */ 1451 */
1450static void gk20a_channel_timeout_check(struct channel_gk20a *ch) 1452static void gk20a_channel_timeout_check(struct channel_gk20a *ch)
1451{ 1453{
1452 bool timed_out; 1454 bool running;
1453 1455
1454 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1456 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1455 timed_out = ch->timeout.running && 1457 running = ch->timeout.running;
1456 nvgpu_timeout_peek_expired(&ch->timeout.timer);
1457 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1458 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1458 1459
1459 if (timed_out) 1460 if (running)
1460 gk20a_channel_timeout_handler(ch); 1461 gk20a_channel_timeout_handler(ch);
1461} 1462}
1462 1463