diff options
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 35 |
1 files changed, 18 insertions, 17 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0c199146..a4637b8f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1388,10 +1388,9 @@ void gk20a_channel_timeout_restart_all_channels(struct gk20a *g) | |||
1388 | /** | 1388 | /** |
1389 | * Check if a timed out channel has hung and recover it if it has. | 1389 | * Check if a timed out channel has hung and recover it if it has. |
1390 | * | 1390 | * |
1391 | * Test if this channel has really got stuck at this point (should be called | 1391 | * Test if this channel has really got stuck at this point by checking if its |
1392 | * when the watchdog timer has expired) by checking if its gp_get has advanced | 1392 | * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since |
1393 | * or not. If no gp_get action happened since when the watchdog was started, | 1393 | * when the watchdog was started and it's timed out, force-reset the channel. |
1394 | * force-reset the channel. | ||
1395 | * | 1394 | * |
1396 | * The gpu is implicitly on at this point, because the watchdog can only run on | 1395 | * The gpu is implicitly on at this point, because the watchdog can only run on |
1397 | * channels that have submitted jobs pending for cleanup. | 1396 | * channels that have submitted jobs pending for cleanup. |
@@ -1406,22 +1405,27 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch) | |||
1406 | 1405 | ||
1407 | nvgpu_log_fn(g, " "); | 1406 | nvgpu_log_fn(g, " "); |
1408 | 1407 | ||
1409 | /* Get status and clear the timer */ | 1408 | /* Get status but keep timer running */ |
1410 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); | 1409 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); |
1411 | gp_get = ch->timeout.gp_get; | 1410 | gp_get = ch->timeout.gp_get; |
1412 | pb_get = ch->timeout.pb_get; | 1411 | pb_get = ch->timeout.pb_get; |
1413 | ch->timeout.running = false; | ||
1414 | nvgpu_raw_spinlock_release(&ch->timeout.lock); | 1412 | nvgpu_raw_spinlock_release(&ch->timeout.lock); |
1415 | 1413 | ||
1416 | new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch); | 1414 | new_gp_get = g->ops.fifo.userd_gp_get(ch->g, ch); |
1417 | new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch); | 1415 | new_pb_get = g->ops.fifo.userd_pb_get(ch->g, ch); |
1418 | 1416 | ||
1419 | if (new_gp_get != gp_get || new_pb_get != pb_get) { | 1417 | if (new_gp_get != gp_get || new_pb_get != pb_get) { |
1420 | /* Channel has advanced, reschedule */ | 1418 | /* Channel has advanced, rewind timer */ |
1419 | gk20a_channel_timeout_stop(ch); | ||
1421 | gk20a_channel_timeout_start(ch); | 1420 | gk20a_channel_timeout_start(ch); |
1422 | return; | 1421 | return; |
1423 | } | 1422 | } |
1424 | 1423 | ||
1424 | if (!nvgpu_timeout_peek_expired(&ch->timeout.timer)) { | ||
1425 | /* Seems stuck but waiting to time out */ | ||
1426 | return; | ||
1427 | } | ||
1428 | |||
1425 | nvgpu_err(g, "Job on channel %d timed out", | 1429 | nvgpu_err(g, "Job on channel %d timed out", |
1426 | ch->chid); | 1430 | ch->chid); |
1427 | 1431 | ||
@@ -1435,28 +1439,25 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch) | |||
1435 | } | 1439 | } |
1436 | 1440 | ||
1437 | /** | 1441 | /** |
1438 | * Test if the per-channel timeout is expired and handle the timeout in that case. | 1442 | * Test if the per-channel watchdog is on; check the timeout in that case. |
1439 | * | 1443 | * |
1440 | * Each channel has an expiration time based watchdog. The timer is | 1444 | * Each channel has an expiration time based watchdog. The timer is |
1441 | * (re)initialized in two situations: when a new job is submitted on an idle | 1445 | * (re)initialized in two situations: when a new job is submitted on an idle |
1442 | * channel and when the timeout is checked but progress is detected. | 1446 | * channel and when the timeout is checked but progress is detected. The |
1443 | * | 1447 | * watchdog timeout limit is a coarse sliding window. |
1444 | * Watchdog timeout does not yet necessarily mean a stuck channel so this may | ||
1445 | * or may not cause recovery. | ||
1446 | * | 1448 | * |
1447 | * The timeout is stopped (disabled) after the last job in a row finishes | 1449 | * The timeout is stopped (disabled) after the last job in a row finishes |
1448 | * making the channel idle. | 1450 | * and marks the channel idle. |
1449 | */ | 1451 | */ |
1450 | static void gk20a_channel_timeout_check(struct channel_gk20a *ch) | 1452 | static void gk20a_channel_timeout_check(struct channel_gk20a *ch) |
1451 | { | 1453 | { |
1452 | bool timed_out; | 1454 | bool running; |
1453 | 1455 | ||
1454 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); | 1456 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); |
1455 | timed_out = ch->timeout.running && | 1457 | running = ch->timeout.running; |
1456 | nvgpu_timeout_peek_expired(&ch->timeout.timer); | ||
1457 | nvgpu_raw_spinlock_release(&ch->timeout.lock); | 1458 | nvgpu_raw_spinlock_release(&ch->timeout.lock); |
1458 | 1459 | ||
1459 | if (timed_out) | 1460 | if (running) |
1460 | gk20a_channel_timeout_handler(ch); | 1461 | gk20a_channel_timeout_handler(ch); |
1461 | } | 1462 | } |
1462 | 1463 | ||