summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonsta Holtta <kholtta@nvidia.com>2018-05-08 10:02:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2018-05-11 12:53:35 -0400
commit07310de8c1d043b0e5efdcf2d38c28c432b1c9ce (patch)
tree46c3193d7b16f901ebda079c8ada4f4e704aa9ab
parenta7288b58676f14a847592b6d6dcbe9080dfb9edb (diff)
gpu: nvgpu: poll watchdog status actively
Read GP_GET and PB_GET from hardware every time when the poll timer expires instead of when the watchdog timer expires. Restart the watchdog timer if the get pointers have increased since the previous read. This way stuck channels are detected quicker.

Previously it could have taken at most twice the watchdog timeout limit for a stuck channel to get recovered; with this change, a coarse sliding window is used. The polling period is still 100 ms.

The difference is illustrated in the following diagram:

time 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0
get  a b b b b b b b b b b b b b b b b b b b b
prev - n n n n n n n n n A n n n n n n n n n S
next - A s s s s s s s s s S

"time" represents wall time in polling units; 0 is submit time. For simplicity, watchdog timeout is ten units. "get" is the GP_GET that advances a little from a and then gets stuck at b. "prev" is the previous behaviour, "next" is after this patch. "A" is when the channel is detected as advanced, and "S" when it's found stuck and recovered; small "s" is when it's found stuck but when the time limit has not yet expired and "n" is when the hw state is not read.

Bug 1700277
Bug 1982826

Change-Id: Ie2921920d5396cee652729c6a7162b740d7a1f06
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1710554
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c35
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0c199146..a4637b8f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1388,10 +1388,9 @@ void gk20a_channel_timeout_restart_all_channels(struct gk20a *g)
1388/** 1388/**
1389 * Check if a timed out channel has hung and recover it if it has. 1389 * Check if a timed out channel has hung and recover it if it has.
1390 * 1390 *
1391 * Test if this channel has really got stuck at this point (should be called 1391 * Test if this channel has really got stuck at this point by checking if its
1392 * when the watchdog timer has expired) by checking if its gp_get has advanced 1392 * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
1393 * or not. If no gp_get action happened since when the watchdog was started, 1393 * when the watchdog was started and it's timed out, force-reset the channel.
1394 * force-reset the channel.
1395 * 1394 *
1396 * The gpu is implicitly on at this point, because the watchdog can only run on 1395 * The gpu is implicitly on at this point, because the watchdog can only run on
1397 * channels that have submitted jobs pending for cleanup. 1396 * channels that have submitted jobs pending for cleanup.
@@ -1406,22 +1405,27 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1406 1405
1407 nvgpu_log_fn(g, " "); 1406 nvgpu_log_fn(g, " ");
1408 1407
1409 /* Get status and clear the timer */ 1408 /* Get status but keep timer running */
1410 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1409 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1411 gp_get = ch->timeout.gp_get; 1410 gp_get = ch->timeout.gp_get;
1412 pb_get = ch->timeout.pb_get; 1411 pb_get = ch->timeout.pb_get;
1413 ch->timeout.running = false;
1414 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1412 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1415 1413
1416 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch); 1414 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch);
1417 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch); 1415 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch);
1418 1416
1419 if (new_gp_get != gp_get || new_pb_get != pb_get) { 1417 if (new_gp_get != gp_get || new_pb_get != pb_get) {
1420 /* Channel has advanced, reschedule */ 1418 /* Channel has advanced, rewind timer */
1419 gk20a_channel_timeout_stop(ch);
1421 gk20a_channel_timeout_start(ch); 1420 gk20a_channel_timeout_start(ch);
1422 return; 1421 return;
1423 } 1422 }
1424 1423
1424 if (!nvgpu_timeout_peek_expired(&ch->timeout.timer)) {
1425 /* Seems stuck but waiting to time out */
1426 return;
1427 }
1428
1425 nvgpu_err(g, "Job on channel %d timed out", 1429 nvgpu_err(g, "Job on channel %d timed out",
1426 ch->chid); 1430 ch->chid);
1427 1431
@@ -1435,28 +1439,25 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1435} 1439}
1436 1440
1437/** 1441/**
1438 * Test if the per-channel timeout is expired and handle the timeout in that case. 1442 * Test if the per-channel watchdog is on; check the timeout in that case.
1439 * 1443 *
1440 * Each channel has an expiration time based watchdog. The timer is 1444 * Each channel has an expiration time based watchdog. The timer is
1441 * (re)initialized in two situations: when a new job is submitted on an idle 1445 * (re)initialized in two situations: when a new job is submitted on an idle
1442 * channel and when the timeout is checked but progress is detected. 1446 * channel and when the timeout is checked but progress is detected. The
1443 * 1447 * watchdog timeout limit is a coarse sliding window.
1444 * Watchdog timeout does not yet necessarily mean a stuck channel so this may
1445 * or may not cause recovery.
1446 * 1448 *
1447 * The timeout is stopped (disabled) after the last job in a row finishes 1449 * The timeout is stopped (disabled) after the last job in a row finishes
1448 * making the channel idle. 1450 * and marks the channel idle.
1449 */ 1451 */
1450static void gk20a_channel_timeout_check(struct channel_gk20a *ch) 1452static void gk20a_channel_timeout_check(struct channel_gk20a *ch)
1451{ 1453{
1452 bool timed_out; 1454 bool running;
1453 1455
1454 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1456 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1455 timed_out = ch->timeout.running && 1457 running = ch->timeout.running;
1456 nvgpu_timeout_peek_expired(&ch->timeout.timer);
1457 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1458 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1458 1459
1459 if (timed_out) 1460 if (running)
1460 gk20a_channel_timeout_handler(ch); 1461 gk20a_channel_timeout_handler(ch);
1461} 1462}
1462 1463