summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
diff options
context:
space:
mode:
authorDeepak Nibade <dnibade@nvidia.com>2016-01-06 04:07:38 -0500
committerTerje Bergstrom <tbergstrom@nvidia.com>2016-01-11 12:05:58 -0500
commit9713e3572a740216c6ecbc2257349be51c204a67 (patch)
treea2d5d82dbc94ca432295338695dafc5dc8cdfc16 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c
parent2b064ce65e0035a860d1bc3bcccfcf8aac1f31c7 (diff)
gpu: nvgpu: disable ctxsw instead of all engines activity
In gk20a_channel_timeout_handler(), we currently disable all engine activity before checking for fence completion and before we identify timed out channel But disabling all engine activity could be overkill for this process. Also, as part of disabling engine activity we preempt the channel on engine. But it is possible that channel preemption times out since channel has already timed out And this can lead to races and deadlock Hence, instead of disabling all engine activity, just disable the context switch which should also do the same trick Bug 1716062 Change-Id: I596515ed670a2e134f7bcd9758488a4aa0bf16f7 Signed-off-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-on: http://git-master/r/929421 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c10
1 file changed, 5 insertions, 5 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0421c0f6..f0a700ac 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1671,11 +1671,11 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1671 ch->timeout.initialized = false; 1671 ch->timeout.initialized = false;
1672 mutex_unlock(&ch->timeout.lock); 1672 mutex_unlock(&ch->timeout.lock);
1673 1673
1674 if (gk20a_fifo_disable_all_engine_activity(g, true)) 1674 if (gr_gk20a_disable_ctxsw(g))
1675 goto fail_unlock; 1675 goto fail_unlock;
1676 1676
1677 if (gk20a_fence_is_expired(job->post_fence)) 1677 if (gk20a_fence_is_expired(job->post_fence))
1678 goto fail_enable_engine_activity; 1678 goto fail_enable_ctxsw;
1679 1679
1680 gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n", 1680 gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
1681 ch->hw_chid); 1681 ch->hw_chid);
@@ -1698,7 +1698,7 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1698 /* If failing engine, trigger recovery */ 1698 /* If failing engine, trigger recovery */
1699 failing_ch = gk20a_channel_get(&g->fifo.channel[id]); 1699 failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
1700 if (!failing_ch) 1700 if (!failing_ch)
1701 goto fail_enable_engine_activity; 1701 goto fail_enable_ctxsw;
1702 1702
1703 if (failing_ch->hw_chid != ch->hw_chid) 1703 if (failing_ch->hw_chid != ch->hw_chid)
1704 gk20a_channel_timeout_start(ch, job); 1704 gk20a_channel_timeout_start(ch, job);
@@ -1710,8 +1710,8 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1710 gk20a_channel_put(failing_ch); 1710 gk20a_channel_put(failing_ch);
1711 } 1711 }
1712 1712
1713fail_enable_engine_activity: 1713fail_enable_ctxsw:
1714 gk20a_fifo_enable_all_engine_activity(g); 1714 gr_gk20a_enable_ctxsw(g);
1715fail_unlock: 1715fail_unlock:
1716 mutex_unlock(&g->ch_wdt_lock); 1716 mutex_unlock(&g->ch_wdt_lock);
1717 gk20a_channel_put(ch); 1717 gk20a_channel_put(ch);