summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu
diff options
context:
space:
mode:
author: Konsta Holtta <kholtta@nvidia.com> 2018-10-22 08:22:37 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2019-02-05 04:53:04 -0500
commit: aa84e8a9867fdc72812f2609c142bdd21e5d03de (patch)
tree: 324fd6e728f10f5cefbdee565b8056c3fb6dae88 /drivers/gpu/nvgpu
parent: bcac2a22a40e5920e4d88b6dc849f37d55553d02 (diff)
gpu: nvgpu: fix double handling in timeout
The context switch timeout works by triggering a hardware timeout at 10 Hz. When handling these, we check whether a channel has actually timed out. Currently the timeout limit can be shorter than the 10 Hz interval, which always causes us to recover a channel but would also cause detection of progress if there was any in the interval.

Handling both situations at the same time would reuse the channel pointer local to the function after a loop has finished and would cause memory corruption. Fix this by making the two branches mutually exclusive, and move the recover case to happen first because that's how our tests assume things to work.

Jira NVGPU-967
Bug 2502074

Change-Id: I26aa0fa7fd80ab42a9a1a93a6cca2cd29c9d3f3f
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1932449
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry picked from commit 8ac9a53d816a3d012a6948a9a96ac6db699c662d in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/1997597
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Tested-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c 46
1 files changed, 24 insertions, 22 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 2e19d585..2dd18370 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -2359,29 +2359,13 @@ bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
2359 } 2359 }
2360 } 2360 }
2361 2361
2362 /* if at least one channel in the TSG made some progress, reset
2363 * accumulated timeout for all channels in the TSG. In particular,
2364 * this resets timeout for channels that already completed their work
2365 */
2366 if (progress) {
2367 nvgpu_log_info(g, "progress on tsg=%d ch=%d",
2368 tsg->tsgid, ch->chid);
2369 gk20a_channel_put(ch);
2370 *ms = g->fifo_eng_timeout_us / 1000;
2371 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
2372 channel_gk20a, ch_entry) {
2373 if (gk20a_channel_get(ch)) {
2374 ch->timeout_accumulated_ms = *ms;
2375 gk20a_channel_put(ch);
2376 }
2377 }
2378 }
2379
2380 /* if one channel is presumed dead (no progress for too long), then
2381 * fifo recovery is needed. we can't really figure out which channel
2382 * caused the problem, so set timeout error notifier for all channels.
2383 */
2384 if (recover) { 2362 if (recover) {
2363 /*
2364 * if one channel is presumed dead (no progress for too long),
2365 * then fifo recovery is needed. we can't really figure out
2366 * which channel caused the problem, so set timeout error
2367 * notifier for all channels.
2368 */
2385 nvgpu_log_info(g, "timeout on tsg=%d ch=%d", 2369 nvgpu_log_info(g, "timeout on tsg=%d ch=%d",
2386 tsg->tsgid, ch->chid); 2370 tsg->tsgid, ch->chid);
2387 *ms = ch->timeout_accumulated_ms; 2371 *ms = ch->timeout_accumulated_ms;
@@ -2397,6 +2381,24 @@ bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
2397 gk20a_channel_put(ch); 2381 gk20a_channel_put(ch);
2398 } 2382 }
2399 } 2383 }
2384 } else if (progress) {
2385 /*
2386 * if at least one channel in the TSG made some progress, reset
2387 * accumulated timeout for all channels in the TSG. In
2388 * particular, this resets timeout for channels that already
2389 * completed their work
2390 */
2391 nvgpu_log_info(g, "progress on tsg=%d ch=%d",
2392 tsg->tsgid, ch->chid);
2393 gk20a_channel_put(ch);
2394 *ms = g->fifo_eng_timeout_us / 1000;
2395 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
2396 channel_gk20a, ch_entry) {
2397 if (gk20a_channel_get(ch)) {
2398 ch->timeout_accumulated_ms = *ms;
2399 gk20a_channel_put(ch);
2400 }
2401 }
2400 } 2402 }
2401 2403
2402 /* if we could not detect progress on any of the channel, but none 2404 /* if we could not detect progress on any of the channel, but none