gpu: nvgpu: fix double handling in timeout

The context switch timeout works by triggering a hardware timeout at 10 Hz. When handling each of these, we check whether a channel has actually timed out. Currently the timeout limit can be shorter than the 10 Hz interval. In that case we always recover a channel, but we would also detect progress if there was any during the interval. Handling both situations in the same call would reuse the function-local channel pointer after a loop over the TSG's channel list has finished, and would cause memory corruption.

Fix this by making the two branches mutually exclusive, and move the recover case to happen first because that is how our tests assume things work.

Jira NVGPU-967
Bug 2502074

Change-Id: I26aa0fa7fd80ab42a9a1a93a6cca2cd29c9d3f3f
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1932449
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry picked from commit 8ac9a53d816a3d012a6948a9a96ac6db699c662d in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/1997597
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Tested-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Konsta Holtta <kholtta@nvidia.com> 2018-10-22 08:22:37 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2019-02-05 04:53:04 -0500
commit: aa84e8a9867fdc72812f2609c142bdd21e5d03de (patch)
tree: 324fd6e728f10f5cefbdee565b8056c3fb6dae88 /drivers/gpu/nvgpu
parent: bcac2a22a40e5920e4d88b6dc849f37d55553d02 (diff)
1 files changed, 24 insertions, 22 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 2e19d585..2dd18370 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -2359,29 +2359,13 @@ bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
                }
        }
-        /* if at least one channel in the TSG made some progress, reset
-         * accumulated timeout for all channels in the TSG. In particular,
-         * this resets timeout for channels that already completed their work
-         */
-        if (progress) {
-                nvgpu_log_info(g, "progress on tsg=%d ch=%d",
-                                tsg->tsgid, ch->chid);
-                gk20a_channel_put(ch);
-                *ms = g->fifo_eng_timeout_us / 1000;
-                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch)) {
-                                ch->timeout_accumulated_ms = *ms;
-                                gk20a_channel_put(ch);
-                        }
-                }
-        }
-        /* if one channel is presumed dead (no progress for too long), then
-         * fifo recovery is needed. we can't really figure out which channel
-         * caused the problem, so set timeout error notifier for all channels.
-         */
        if (recover) {
+                /*
+                 * if one channel is presumed dead (no progress for too long),
+                 * then fifo recovery is needed. we can't really figure out
+                 * which channel caused the problem, so set timeout error
+                 * notifier for all channels.
+                 */
                nvgpu_log_info(g, "timeout on tsg=%d ch=%d",
                                tsg->tsgid, ch->chid);
                *ms = ch->timeout_accumulated_ms;
@@ -2397,6 +2381,24 @@ bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
                                gk20a_channel_put(ch);
                        }
                }
+        } else if (progress) {
+                /*
+                 * if at least one channel in the TSG made some progress, reset
+                 * accumulated timeout for all channels in the TSG. In
+                 * particular, this resets timeout for channels that already
+                 * completed their work
+                 */
+                nvgpu_log_info(g, "progress on tsg=%d ch=%d",
+                                tsg->tsgid, ch->chid);
+                gk20a_channel_put(ch);
+                *ms = g->fifo_eng_timeout_us / 1000;
+                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
+                                channel_gk20a, ch_entry) {
+                        if (gk20a_channel_get(ch)) {
+                                ch->timeout_accumulated_ms = *ms;
+                                gk20a_channel_put(ch);
+                        }
+                }
        }
        /* if we could not detect progress on any of the channel, but none
author	Konsta Holtta <kholtta@nvidia.com>	2018-10-22 08:22:37 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2019-02-05 04:53:04 -0500
commit	aa84e8a9867fdc72812f2609c142bdd21e5d03de (patch)
tree	324fd6e728f10f5cefbdee565b8056c3fb6dae88 /drivers/gpu/nvgpu
parent	bcac2a22a40e5920e4d88b6dc849f37d55553d02 (diff)