diff options
author | Seema Khowala <seemaj@nvidia.com> | 2017-11-16 16:46:11 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-11-28 12:46:46 -0500 |
commit | 3fbb44d7576238d42635e2ca6501a17cdc7306f7 (patch) | |
tree | e0182e16b6dd13838503d06b111c266c2f1d1b52 /drivers/gpu/nvgpu | |
parent | 87f42744e0821d54eff7bf0bb863c7e53063e5c1 (diff) |
gpu: nvgpu: gv11b: channel/tsg recovery reorged
Context TSG teardown procedure:
1. Disable scheduling for the engine's runlist via PFIFO_SCHED_DISABLE.
This enables SW to determine whether a context has hung later in the
process: otherwise, ongoing work on the runlist may keep ENG_STATUS from
reaching a steady state.
2. Disable all channels in the TSG being torn down or submit a new runlist
that does not contain the TSG. This is to prevent the TSG from being
rescheduled once scheduling is re-enabled in step 6.
3.
a) Initiate a preempt of the TSG by writing NV_PFIFO_PREEMPT
with the TSG's ID and the TYPE set to TSG, if the TSG id is known;
otherwise do 3b.
b) Initiate a preempt of the engine by writing the bit associated with its
runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
process prior to doing the slow register reads needed to determine
whether the context has hit any interrupts or is hung. Do not poll
NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete.
4. Check for preempt done
5. If a reset is needed as determined by step 4:
a. Halt the memory interface for the engine (as per the relevant engine
procedure).
b. Reset the engine via PMC_ENABLE.
c. Take the engine out of reset and reinit the engine (as per the relevant
engine procedure)
6. Re-enable scheduling for the engine's runlist via PFIFO_SCHED_ENABLE.
Bug 200277163
Change-Id: I1e945a2c6b9845f365d6952109f6803309aa2270
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1599841
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 111 |
1 files changed, 60 insertions, 51 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index 0238ae6c..ae2b6cfc 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | |||
@@ -660,7 +660,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask, | |||
660 | } | 660 | } |
661 | } | 661 | } |
662 | } | 662 | } |
663 | gk20a_dbg_info("runlists_mask = %08x", runlists_mask); | 663 | nvgpu_log(g, gpu_dbg_info, "runlists_mask = 0x%08x", runlists_mask); |
664 | return runlists_mask; | 664 | return runlists_mask; |
665 | } | 665 | } |
666 | 666 | ||
@@ -873,9 +873,14 @@ static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask) | |||
873 | nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); | 873 | nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); |
874 | 874 | ||
875 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { | 875 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { |
876 | if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id)) | 876 | if (runlists_mask & |
877 | fifo_runlist_preempt_runlist_m(runlist_id)) { | ||
878 | /* during recovery reset engs served by this runlist */ | ||
879 | g->fifo.runlist_info[runlist_id].reset_eng_bitmask = | ||
880 | g->fifo.runlist_info[runlist_id].eng_bitmask; | ||
877 | nvgpu_mutex_release(&g->fifo. | 881 | nvgpu_mutex_release(&g->fifo. |
878 | runlist_info[runlist_id].mutex); | 882 | runlist_info[runlist_id].mutex); |
883 | } | ||
879 | } | 884 | } |
880 | 885 | ||
881 | return ret; | 886 | return ret; |
@@ -955,11 +960,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
955 | struct fifo_runlist_info_gk20a *runlist = NULL; | 960 | struct fifo_runlist_info_gk20a *runlist = NULL; |
956 | u32 engine_id, client_type = ~0; | 961 | u32 engine_id, client_type = ~0; |
957 | 962 | ||
958 | gk20a_dbg_info("active engine ids bitmask =0x%x", act_eng_bitmask); | 963 | nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, " |
959 | gk20a_dbg_info("hw id =%d", id); | 964 | "act_eng_bitmask = 0x%x, mmfault ptr = 0x%p", |
960 | gk20a_dbg_info("id_type =%d", id_type); | 965 | id, id_type, rc_type, act_eng_bitmask, mmfault); |
961 | gk20a_dbg_info("rc_type =%d", rc_type); | ||
962 | gk20a_dbg_info("mmu_fault =0x%p", mmfault); | ||
963 | 966 | ||
964 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, | 967 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, |
965 | id_type, rc_type, mmfault); | 968 | id_type, rc_type, mmfault); |
@@ -986,25 +989,29 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
986 | 989 | ||
987 | gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); | 990 | gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); |
988 | 991 | ||
989 | if (rc_type == RC_TYPE_MMU_FAULT) | 992 | /* Get tsg/ch */ |
993 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
990 | gk20a_debug_dump(g); | 994 | gk20a_debug_dump(g); |
991 | |||
992 | /* get the channel/TSG */ | ||
993 | if (rc_type == RC_TYPE_MMU_FAULT && mmfault && mmfault->refch) { | ||
994 | refch = mmfault->refch; | 995 | refch = mmfault->refch; |
995 | client_type = mmfault->client_type; | 996 | client_type = mmfault->client_type; |
996 | if (gk20a_is_channel_marked_as_tsg(refch)) | ||
997 | tsg = &g->fifo.tsg[refch->tsgid]; | ||
998 | gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch, | 997 | gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch, |
999 | mmfault->faulted_pbdma, | 998 | mmfault->faulted_pbdma, |
1000 | mmfault->faulted_engine); | 999 | mmfault->faulted_engine); |
1001 | } else { | 1000 | } |
1002 | if (id_type == ID_TYPE_TSG) | 1001 | |
1003 | tsg = &g->fifo.tsg[id]; | 1002 | if (id_type == ID_TYPE_TSG) { |
1004 | else if (id_type == ID_TYPE_CHANNEL) | 1003 | tsg = &g->fifo.tsg[id]; |
1004 | } else if (id_type == ID_TYPE_CHANNEL) { | ||
1005 | if (refch == NULL) | ||
1005 | refch = gk20a_channel_get(&g->fifo.channel[id]); | 1006 | refch = gk20a_channel_get(&g->fifo.channel[id]); |
1006 | } | 1007 | } |
1008 | /* Disable tsg/ch */ | ||
1009 | if (tsg) | ||
1010 | gk20a_disable_tsg(tsg); | ||
1011 | else if (refch) | ||
1012 | g->ops.fifo.disable_channel(refch); | ||
1007 | 1013 | ||
1014 | /* Preempt tsg/ch */ | ||
1008 | if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) { | 1015 | if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) { |
1009 | g->ops.fifo.preempt_ch_tsg(g, id, id_type, | 1016 | g->ops.fifo.preempt_ch_tsg(g, id, id_type, |
1010 | PREEMPT_TIMEOUT_NORC); | 1017 | PREEMPT_TIMEOUT_NORC); |
@@ -1012,35 +1019,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1012 | gv11b_fifo_preempt_runlists(g, runlists_mask); | 1019 | gv11b_fifo_preempt_runlists(g, runlists_mask); |
1013 | } | 1020 | } |
1014 | 1021 | ||
1015 | if (tsg) { | ||
1016 | if (!g->fifo.deferred_reset_pending) { | ||
1017 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1018 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1019 | verbose = gk20a_fifo_error_tsg(g, tsg); | ||
1020 | } | ||
1021 | } | ||
1022 | gk20a_fifo_abort_tsg(g, tsg->tsgid, false); | ||
1023 | if (refch) | ||
1024 | gk20a_channel_put(refch); | ||
1025 | } else if (refch) { | ||
1026 | if (!g->fifo.deferred_reset_pending) { | ||
1027 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1028 | gk20a_fifo_set_ctx_mmu_error_ch(g, refch); | ||
1029 | verbose = gk20a_fifo_error_ch(g, refch); | ||
1030 | } | ||
1031 | } | ||
1032 | gk20a_channel_abort(refch, false); | ||
1033 | gk20a_channel_put(refch); | ||
1034 | } else { | ||
1035 | nvgpu_err(g, "id unknown, abort runlist"); | ||
1036 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; | ||
1037 | runlist_id++) { | ||
1038 | if (runlists_mask & BIT(runlist_id)) | ||
1039 | g->ops.fifo.update_runlist(g, runlist_id, | ||
1040 | FIFO_INVAL_CHANNEL_ID, false, true); | ||
1041 | } | ||
1042 | } | ||
1043 | |||
1044 | /* check if engine reset should be deferred */ | 1022 | /* check if engine reset should be deferred */ |
1045 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { | 1023 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { |
1046 | 1024 | ||
@@ -1051,7 +1029,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1051 | unsigned long __reset_eng_bitmask = | 1029 | unsigned long __reset_eng_bitmask = |
1052 | runlist->reset_eng_bitmask; | 1030 | runlist->reset_eng_bitmask; |
1053 | 1031 | ||
1054 | for_each_set_bit(engine_id, &__reset_eng_bitmask, 32) { | 1032 | for_each_set_bit(engine_id, &__reset_eng_bitmask, |
1033 | g->fifo.max_engines) { | ||
1055 | if ((refch || tsg) && | 1034 | if ((refch || tsg) && |
1056 | gk20a_fifo_should_defer_engine_reset(g, | 1035 | gk20a_fifo_should_defer_engine_reset(g, |
1057 | engine_id, client_type, false)) { | 1036 | engine_id, client_type, false)) { |
@@ -1061,7 +1040,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1061 | 1040 | ||
1062 | /* handled during channel free */ | 1041 | /* handled during channel free */ |
1063 | g->fifo.deferred_reset_pending = true; | 1042 | g->fifo.deferred_reset_pending = true; |
1064 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, | 1043 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, |
1065 | "sm debugger attached," | 1044 | "sm debugger attached," |
1066 | " deferring channel recovery to channel free"); | 1045 | " deferring channel recovery to channel free"); |
1067 | } else { | 1046 | } else { |
@@ -1084,12 +1063,42 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1084 | } | 1063 | } |
1085 | 1064 | ||
1086 | #ifdef CONFIG_GK20A_CTXSW_TRACE | 1065 | #ifdef CONFIG_GK20A_CTXSW_TRACE |
1087 | if (refch) | 1066 | /* tsg and refch both could be valid for mmu fault. Check tsg first */ |
1088 | gk20a_ctxsw_trace_channel_reset(g, refch); | 1067 | if (tsg) |
1089 | else if (tsg) | ||
1090 | gk20a_ctxsw_trace_tsg_reset(g, tsg); | 1068 | gk20a_ctxsw_trace_tsg_reset(g, tsg); |
1069 | else if (refch) | ||
1070 | gk20a_ctxsw_trace_channel_reset(g, refch); | ||
1091 | #endif | 1071 | #endif |
1092 | 1072 | ||
1073 | if (tsg) { | ||
1074 | if (!g->fifo.deferred_reset_pending) { | ||
1075 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1076 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1077 | verbose = gk20a_fifo_error_tsg(g, tsg); | ||
1078 | } | ||
1079 | } | ||
1080 | gk20a_fifo_abort_tsg(g, tsg->tsgid, false); | ||
1081 | if (refch) | ||
1082 | gk20a_channel_put(refch); | ||
1083 | } else if (refch) { | ||
1084 | if (!g->fifo.deferred_reset_pending) { | ||
1085 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1086 | gk20a_fifo_set_ctx_mmu_error_ch(g, refch); | ||
1087 | verbose = gk20a_fifo_error_ch(g, refch); | ||
1088 | } | ||
1089 | } | ||
1090 | gk20a_channel_abort(refch, false); | ||
1091 | gk20a_channel_put(refch); | ||
1092 | } else { | ||
1093 | nvgpu_err(g, "id unknown, abort runlist"); | ||
1094 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; | ||
1095 | runlist_id++) { | ||
1096 | if (runlists_mask & BIT(runlist_id)) | ||
1097 | g->ops.fifo.update_runlist(g, runlist_id, | ||
1098 | FIFO_INVAL_CHANNEL_ID, false, true); | ||
1099 | } | ||
1100 | } | ||
1101 | |||
1093 | gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED, | 1102 | gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED, |
1094 | !RUNLIST_INFO_MUTEX_LOCKED); | 1103 | !RUNLIST_INFO_MUTEX_LOCKED); |
1095 | 1104 | ||