author		Debarshi Dutta <ddutta@nvidia.com>	2019-04-30 05:41:31 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2019-05-09 17:42:33 -0400
commit		6509bb49da19ba9b19e3df64e473b01d54fd310d (patch)
tree		b34d19c88fc122f369b1f22094d9a5e22c67df92 /drivers/gpu/nvgpu
parent		4d8ad643d67ac4044f76976c4085a35fcc5d4095 (diff)
gpu: nvgpu: protect recovery with engines_reset_mutex
Rename gr_reset_mutex to engines_reset_mutex and acquire it before
initiating recovery. Recovery running in parallel with an engine reset
is not recommended.

On hitting engine reset, h/w drops the ctxsw_status to INVALID in the
fifo_engine_status register. Also, while the engine is held in reset,
h/w passes busy/idle straight through. The fifo_engine_status registers
are correct in that there is no context switch outstanding, as the
CTXSW is aborted when reset is asserted.

Use deferred_reset_mutex to protect the deferred_reset_pending
variable. If deferred_reset_pending is true, acquire
engines_reset_mutex and call gk20a_fifo_deferred_reset.
gk20a_fifo_deferred_reset also re-checks the value of
deferred_reset_pending before initiating the reset process.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry-picked from cb91bf1e13740023903282d1c2271d9154e940ba in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r--	drivers/gpu/nvgpu/common/fifo/channel.c	19
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.c	68
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.h	2
-rw-r--r--	drivers/gpu/nvgpu/gv11b/fifo_gv11b.c	47
4 files changed, 90 insertions, 46 deletions
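For reference, the locking protocol described in the commit message can be modeled as a small
self-contained program. This is only a sketch: pthread mutexes stand in for nvgpu_mutex, and
struct fifo_model, model_defer_reset() and model_channel_free() are illustrative names, not part
of the nvgpu driver.

/*
 * Minimal model of the deferred-reset locking protocol described above.
 * pthread mutexes stand in for nvgpu_mutex; the struct and function
 * names are simplified stand-ins, not the actual nvgpu API.
 * Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fifo_model {
	pthread_mutex_t deferred_reset_mutex; /* guards deferred_reset_pending */
	pthread_mutex_t engines_reset_mutex;  /* serializes engine reset vs. recovery */
	bool deferred_reset_pending;
};

/* Fault path: mark the reset as deferred, under deferred_reset_mutex. */
static void model_defer_reset(struct fifo_model *f)
{
	pthread_mutex_lock(&f->deferred_reset_mutex);
	f->deferred_reset_pending = true;
	pthread_mutex_unlock(&f->deferred_reset_mutex);
}

/* Channel-free path: snapshot the flag, then reset under engines_reset_mutex. */
static void model_channel_free(struct fifo_model *f)
{
	bool pending;

	pthread_mutex_lock(&f->deferred_reset_mutex);
	pending = f->deferred_reset_pending;
	pthread_mutex_unlock(&f->deferred_reset_mutex);

	if (pending) {
		/* Unconditional acquire replaces the old trylock-and-skip. */
		pthread_mutex_lock(&f->engines_reset_mutex);
		/* ... the deferred engine reset would happen here ... */
		pthread_mutex_lock(&f->deferred_reset_mutex);
		f->deferred_reset_pending = false;
		pthread_mutex_unlock(&f->deferred_reset_mutex);
		pthread_mutex_unlock(&f->engines_reset_mutex);
	}
}

int main(void)
{
	struct fifo_model f = {
		.deferred_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.engines_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.deferred_reset_pending = false,
	};

	model_defer_reset(&f);
	model_channel_free(&f);
	printf("deferred_reset_pending = %d\n", f.deferred_reset_pending);
	return 0;
}

The point the model tries to show: the pending flag is only ever touched under
deferred_reset_mutex, which nests inside engines_reset_mutex, while the reset itself is done with
engines_reset_mutex held unconditionally instead of the old gr_reset_mutex trylock-and-skip.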
diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index d30b8ded..4bea032a 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -308,6 +308,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 	struct dbg_session_data *session_data, *tmp_s;
 	struct dbg_session_channel_data *ch_data, *tmp;
 	int err;
+	bool deferred_reset_pending;
 
 	nvgpu_log_fn(g, " ");
 
@@ -381,17 +382,17 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	/* if engine reset was deferred, perform it now */
 	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
-	if (g->fifo.deferred_reset_pending) {
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (deferred_reset_pending) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
 			" deferred, running now");
-		/* if lock is already taken, a reset is taking place
-		so no need to repeat */
-		if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-			gk20a_fifo_deferred_reset(g, ch);
-			nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-		}
+		nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+		gk20a_fifo_deferred_reset(g, ch);
+		nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 	}
-	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 
 	if (!gk20a_channel_as_bound(ch)) {
 		goto unbind;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index b96372b4..5aca7d62 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -910,9 +910,9 @@ int gk20a_init_fifo_setup_sw_common(struct gk20a *g)
 		return err;
 	}
 
-	err = nvgpu_mutex_init(&f->gr_reset_mutex);
+	err = nvgpu_mutex_init(&f->engines_reset_mutex);
 	if (err) {
-		nvgpu_err(g, "failed to init gr_reset_mutex");
+		nvgpu_err(g, "failed to init engines_reset_mutex");
 		return err;
 	}
 
@@ -1581,14 +1581,22 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 {
 	unsigned long engine_id, engines = 0U;
 	struct tsg_gk20a *tsg;
+	bool deferred_reset_pending;
+	struct fifo_gk20a *f = &g->fifo;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-	gr_gk20a_disable_ctxsw(g);
 
-	if (!g->fifo.deferred_reset_pending) {
-		goto clean_up;
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (!deferred_reset_pending) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return 0;
 	}
 
+	gr_gk20a_disable_ctxsw(g);
+
 	tsg = tsg_gk20a_from_ch(ch);
 	if (tsg != NULL) {
 		engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
@@ -1610,8 +1618,10 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 	g->fifo.deferred_fault_engines = 0;
 	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 clean_up:
 	gr_gk20a_enable_ctxsw(g);
@@ -1632,9 +1642,10 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	bool verbose = true;
 	u32 grfifo_ctl;
 
-	nvgpu_log_fn(g, " ");
+	bool deferred_reset_pending = false;
+	struct fifo_gk20a *f = &g->fifo;
 
-	g->fifo.deferred_reset_pending = false;
+	nvgpu_log_fn(g, " ");
 
 	/* Disable power management */
 	if (g->support_pmu) {
@@ -1661,6 +1672,9 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 		gk20a_debug_dump(g);
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 	/* go through all faulted engines */
 	for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) {
@@ -1761,17 +1775,17 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 				g->fifo.deferred_fault_engines |= BIT(engine_id);
 
 				/* handled during channel free */
+				nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 				g->fifo.deferred_reset_pending = true;
+				nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+				deferred_reset_pending = true;
+
 				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 					"sm debugger attached,"
 					" deferring channel recovery to channel free");
 			} else {
-				/* if lock is already taken, a reset is taking place
-				so no need to repeat */
-				if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-					gk20a_fifo_reset_engine(g, engine_id);
-					nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-				}
+				gk20a_fifo_reset_engine(g, engine_id);
 			}
 		}
 
@@ -1784,7 +1798,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	 * Disable the channel/TSG from hw and increment syncpoints.
 	 */
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (!fake_fault) {
@@ -1847,6 +1861,9 @@ static bool gk20a_fifo_handle_mmu_fault(
 
 	nvgpu_log_fn(g, " ");
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -1859,6 +1876,10 @@ static bool gk20a_fifo_handle_mmu_fault(
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	return verbose;
 }
 
@@ -1954,6 +1975,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 	g->ops.fifo.disable_tsg(tsg);
 
 	/*
+	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+	 * fifo_engine_status register. Also while the engine is held in reset
+	 * h/w passes busy/idle straight through. fifo_engine_status registers
+	 * are correct in that there is no context switch outstanding
+	 * as the CTXSW is aborted when reset is asserted.
+	 */
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
+	/*
 	 * stop context switching to prevent engine assignments from
 	 * changing until engine status is checked to make sure tsg
 	 * being recovered is not loaded on the engines
@@ -1980,6 +2011,9 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 		}
 	}
 
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	if (engines) {
 		gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
 				rc_type);
@@ -2030,6 +2064,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false;
 	u32 rlid;
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -2094,6 +2131,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 0c9d9101..26365cae 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -184,7 +184,7 @@ struct fifo_gk20a {
 	/* zero-kref'd channels here */
 	struct nvgpu_list_node free_chs;
 	struct nvgpu_mutex free_chs_mutex;
-	struct nvgpu_mutex gr_reset_mutex;
+	struct nvgpu_mutex engines_reset_mutex;
 
 	struct tsg_gk20a *tsg;
 	struct nvgpu_mutex tsg_inuse_mutex;
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index b3c59f84..3c2de4f2 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -1024,6 +1024,11 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	u32 num_runlists = 0;
 	unsigned long runlist_served_pbdmas;
 
+	bool deferred_reset_pending = false;
+
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&f->runlist_info[rlid].
@@ -1094,8 +1099,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 
-	g->fifo.deferred_reset_pending = false;
-
 	/* Disable power management */
 	if (g->support_pmu) {
 		if (nvgpu_cg_pg_disable(g) != 0) {
@@ -1143,6 +1146,10 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 	/* check if engine reset should be deferred */
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 
@@ -1159,28 +1166,21 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 					gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
 					g->fifo.deferred_fault_engines |=
 							BIT(engine_id);
 
 					/* handled during channel free */
-					g->fifo.deferred_reset_pending = true;
-					nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-						"sm debugger attached,"
-						" deferring channel recovery to channel free");
+					nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+					g->fifo.deferred_reset_pending = true;
+					nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+					deferred_reset_pending = true;
+
+					nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+						"sm debugger attached,"
+						" deferring channel recovery to channel free");
 				} else {
-					/*
-					 * if lock is already taken, a reset is
-					 * taking place so no need to repeat
-					 */
-					if (nvgpu_mutex_tryacquire(
-						&g->fifo.gr_reset_mutex)) {
-
-						gk20a_fifo_reset_engine(g,
-							engine_id);
-
-						nvgpu_mutex_release(
-							&g->fifo.gr_reset_mutex);
-					}
+					gk20a_fifo_reset_engine(g, engine_id);
 				}
 			}
 		}
@@ -1191,7 +1191,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	gk20a_ctxsw_trace_tsg_reset(g, tsg);
 #endif
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (rc_type == RC_TYPE_MMU_FAULT) {
@@ -1228,6 +1228,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 				runlist_lock);
 		}
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)