summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/gpu/nvgpu/gk20a/channel_gk20a.c 35
1 files changed, 18 insertions, 17 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0c199146..a4637b8f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1388,10 +1388,9 @@ void gk20a_channel_timeout_restart_all_channels(struct gk20a *g)
1388/** 1388/**
1389 * Check if a timed out channel has hung and recover it if it has. 1389 * Check if a timed out channel has hung and recover it if it has.
1390 * 1390 *
1391 * Test if this channel has really got stuck at this point (should be called 1391 * Test if this channel has really got stuck at this point by checking if its
1392 * when the watchdog timer has expired) by checking if its gp_get has advanced 1392 * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
1393 * or not. If no gp_get action happened since when the watchdog was started, 1393 * when the watchdog was started and it's timed out, force-reset the channel.
1394 * force-reset the channel.
1395 * 1394 *
1396 * The gpu is implicitly on at this point, because the watchdog can only run on 1395 * The gpu is implicitly on at this point, because the watchdog can only run on
1397 * channels that have submitted jobs pending for cleanup. 1396 * channels that have submitted jobs pending for cleanup.
@@ -1406,22 +1405,27 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1406 1405
1407 nvgpu_log_fn(g, " "); 1406 nvgpu_log_fn(g, " ");
1408 1407
1409 /* Get status and clear the timer */ 1408 /* Get status but keep timer running */
1410 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1409 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1411 gp_get = ch->timeout.gp_get; 1410 gp_get = ch->timeout.gp_get;
1412 pb_get = ch->timeout.pb_get; 1411 pb_get = ch->timeout.pb_get;
1413 ch->timeout.running = false;
1414 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1412 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1415 1413
1416 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch); 1414 new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch);
1417 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch); 1415 new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch);
1418 1416
1419 if (new_gp_get != gp_get || new_pb_get != pb_get) { 1417 if (new_gp_get != gp_get || new_pb_get != pb_get) {
1420 /* Channel has advanced, reschedule */ 1418 /* Channel has advanced, rewind timer */
1419 gk20a_channel_timeout_stop(ch);
1421 gk20a_channel_timeout_start(ch); 1420 gk20a_channel_timeout_start(ch);
1422 return; 1421 return;
1423 } 1422 }
1424 1423
1424 if (!nvgpu_timeout_peek_expired(&ch->timeout.timer)) {
1425 /* Seems stuck but waiting to time out */
1426 return;
1427 }
1428
1425 nvgpu_err(g, "Job on channel %d timed out", 1429 nvgpu_err(g, "Job on channel %d timed out",
1426 ch->chid); 1430 ch->chid);
1427 1431
@@ -1435,28 +1439,25 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1435} 1439}
1436 1440
1437/** 1441/**
1438 * Test if the per-channel timeout is expired and handle the timeout in that case. 1442 * Test if the per-channel watchdog is on; check the timeout in that case.
1439 * 1443 *
1440 * Each channel has an expiration time based watchdog. The timer is 1444 * Each channel has an expiration time based watchdog. The timer is
1441 * (re)initialized in two situations: when a new job is submitted on an idle 1445 * (re)initialized in two situations: when a new job is submitted on an idle
1442 * channel and when the timeout is checked but progress is detected. 1446 * channel and when the timeout is checked but progress is detected. The
1443 * 1447 * watchdog timeout limit is a coarse sliding window.
1444 * Watchdog timeout does not yet necessarily mean a stuck channel so this may
1445 * or may not cause recovery.
1446 * 1448 *
1447 * The timeout is stopped (disabled) after the last job in a row finishes 1449 * The timeout is stopped (disabled) after the last job in a row finishes
1448 * making the channel idle. 1450 * and marks the channel idle.
1449 */ 1451 */
1450static void gk20a_channel_timeout_check(struct channel_gk20a *ch) 1452static void gk20a_channel_timeout_check(struct channel_gk20a *ch)
1451{ 1453{
1452 bool timed_out; 1454 bool running;
1453 1455
1454 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1456 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
1455 timed_out = ch->timeout.running && 1457 running = ch->timeout.running;
1456 nvgpu_timeout_peek_expired(&ch->timeout.timer);
1457 nvgpu_raw_spinlock_release(&ch->timeout.lock); 1458 nvgpu_raw_spinlock_release(&ch->timeout.lock);
1458 1459
1459 if (timed_out) 1460 if (running)
1460 gk20a_channel_timeout_handler(ch); 1461 gk20a_channel_timeout_handler(ch);
1461} 1462}
1462 1463