diff options
author | Debarshi Dutta <ddutta@nvidia.com> | 2019-04-30 05:41:31 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2019-05-09 17:42:33 -0400 |
commit | 6509bb49da19ba9b19e3df64e473b01d54fd310d (patch) | |
tree | b34d19c88fc122f369b1f22094d9a5e22c67df92 /drivers/gpu/nvgpu/common | |
parent | 4d8ad643d67ac4044f76976c4085a35fcc5d4095 (diff) |
gpu: nvgpu: protect recovery with engines_reset_mutex
Rename gr_reset_mutex to engines_reset_mutex and acquire it
before initiating recovery. Recovery running in parallel with
engine reset is not recommended.
On hitting engine reset, h/w drops the ctxsw_status to INVALID in
fifo_engine_status register. Also while the engine is held in reset
h/w passes busy/idle straight through. fifo_engine_status registers
are correct in that there is no context switch outstanding
as the CTXSW is aborted when reset is asserted.
Use deferred_reset_mutex to protect the deferred_reset_pending variable.
If deferred_reset_pending is true, then acquire engines_reset_mutex
and call gk20a_fifo_deferred_reset.
gk20a_fifo_deferred_reset would also check the value of
deferred_reset_pending before initiating the reset process.
Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287
Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry-picked from cb91bf1e13740023903282d1c2271d9154e940ba
in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/common')
-rw-r--r-- | drivers/gpu/nvgpu/common/fifo/channel.c | 19 |
1 file changed, 10 insertions, 9 deletions
diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index d30b8ded..4bea032a 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c | |||
@@ -308,6 +308,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) | |||
308 | struct dbg_session_data *session_data, *tmp_s; | 308 | struct dbg_session_data *session_data, *tmp_s; |
309 | struct dbg_session_channel_data *ch_data, *tmp; | 309 | struct dbg_session_channel_data *ch_data, *tmp; |
310 | int err; | 310 | int err; |
311 | bool deferred_reset_pending; | ||
311 | 312 | ||
312 | nvgpu_log_fn(g, " "); | 313 | nvgpu_log_fn(g, " "); |
313 | 314 | ||
@@ -381,17 +382,17 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) | |||
381 | 382 | ||
382 | /* if engine reset was deferred, perform it now */ | 383 | /* if engine reset was deferred, perform it now */ |
383 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); | 384 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); |
384 | if (g->fifo.deferred_reset_pending) { | 385 | deferred_reset_pending = g->fifo.deferred_reset_pending; |
386 | nvgpu_mutex_release(&f->deferred_reset_mutex); | ||
387 | |||
388 | if (deferred_reset_pending) { | ||
385 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" | 389 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" |
386 | " deferred, running now"); | 390 | " deferred, running now"); |
387 | /* if lock is already taken, a reset is taking place | 391 | nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex); |
388 | so no need to repeat */ | 392 | gk20a_fifo_deferred_reset(g, ch); |
389 | if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) { | 393 | nvgpu_mutex_release(&g->fifo.engines_reset_mutex); |
390 | gk20a_fifo_deferred_reset(g, ch); | ||
391 | nvgpu_mutex_release(&g->fifo.gr_reset_mutex); | ||
392 | } | ||
393 | } | 394 | } |
394 | nvgpu_mutex_release(&f->deferred_reset_mutex); | 395 | |
395 | 396 | ||
396 | if (!gk20a_channel_as_bound(ch)) { | 397 | if (!gk20a_channel_as_bound(ch)) { |
397 | goto unbind; | 398 | goto unbind; |