From 3fbb44d7576238d42635e2ca6501a17cdc7306f7 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Thu, 16 Nov 2017 13:46:11 -0800
Subject: gpu: nvgpu: gv11b: channel/tsg recovery reorged

Context TSG teardown procedure:
1. Disable scheduling for the engine's runlist via PFIFO_SCHED_DISABLE.
   This enables SW to determine whether a context has hung later in the
   process: otherwise, ongoing work on the runlist may keep ENG_STATUS
   from reaching a steady state.
2. Disable all channels in the TSG being torn down or submit a new
   runlist that does not contain the TSG. This is to prevent the TSG
   from being rescheduled once scheduling is re-enabled in step 6.
3. a) Initiate a preempt of the TSG by writing NV_PFIFO_PREEMPT with
      the TSG's ID and the TYPE set to TSG, if the TSG's ID is known;
      else do 3b.
   b) Initiate a preempt of the engine by writing the bit associated
      with its runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to
      begin the preempt process prior to doing the slow register reads
      needed to determine whether the context has hit any interrupts
      or is hung. Do not poll NV_PFIFO_RUNLIST_PREEMPT for the preempt
      to complete.
4. Check for preempt done.
5. If a reset is needed as determined by step 4:
   a. Halt the memory interface for the engine (as per the relevant
      engine procedure).
   b. Reset the engine via PMC_ENABLE.
   c. Take the engine out of reset and reinit the engine (as per the
      relevant engine procedure).
6. Re-enable scheduling for the engine's runlist via PFIFO_SCHED_ENABLE.

Bug 200277163

Change-Id: I1e945a2c6b9845f365d6952109f6803309aa2270
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/1599841
Reviewed-by: svc-mobile-coverity
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 111 +++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 0238ae6c..ae2b6cfc 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -660,7 +660,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 			}
 		}
 	}
-	gk20a_dbg_info("runlists_mask = %08x", runlists_mask);
+	nvgpu_log(g, gpu_dbg_info, "runlists_mask = 0x%08x", runlists_mask);
 	return runlists_mask;
 }
 
@@ -873,9 +873,14 @@ static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-		if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
+		if (runlists_mask &
+			fifo_runlist_preempt_runlist_m(runlist_id)) {
+			/* during recovery reset engs served by this runlist */
+			g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
+				g->fifo.runlist_info[runlist_id].eng_bitmask;
 			nvgpu_mutex_release(&g->fifo.
 				runlist_info[runlist_id].mutex);
+		}
 	}
 
 	return ret;
@@ -955,11 +960,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
 
-	gk20a_dbg_info("active engine ids bitmask =0x%x", act_eng_bitmask);
-	gk20a_dbg_info("hw id =%d", id);
-	gk20a_dbg_info("id_type =%d", id_type);
-	gk20a_dbg_info("rc_type =%d", rc_type);
-	gk20a_dbg_info("mmu_fault =0x%p", mmfault);
+	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
+			"act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
+			id, id_type, rc_type, act_eng_bitmask, mmfault);
 
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 					id_type, rc_type, mmfault);
@@ -986,25 +989,29 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
 
-	if (rc_type == RC_TYPE_MMU_FAULT)
+	/* Get tsg/ch */
+	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
-
-	/* get the channel/TSG */
-	if (rc_type == RC_TYPE_MMU_FAULT && mmfault && mmfault->refch) {
 		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		if (gk20a_is_channel_marked_as_tsg(refch))
-			tsg = &g->fifo.tsg[refch->tsgid];
 		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
 				mmfault->faulted_pbdma,
 				mmfault->faulted_engine);
-	} else {
-		if (id_type == ID_TYPE_TSG)
-			tsg = &g->fifo.tsg[id];
-		else if (id_type == ID_TYPE_CHANNEL)
+	}
+
+	if (id_type == ID_TYPE_TSG) {
+		tsg = &g->fifo.tsg[id];
+	} else if (id_type == ID_TYPE_CHANNEL) {
+		if (refch == NULL)
 			refch = gk20a_channel_get(&g->fifo.channel[id]);
 	}
 
+	/* Disable tsg/ch */
+	if (tsg)
+		gk20a_disable_tsg(tsg);
+	else if (refch)
+		g->ops.fifo.disable_channel(refch);
+	/* Preempt tsg/ch */
 	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
 		g->ops.fifo.preempt_ch_tsg(g, id, id_type,
 						PREEMPT_TIMEOUT_NORC);
@@ -1012,35 +1019,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		gv11b_fifo_preempt_runlists(g, runlists_mask);
 	}
 
-	if (tsg) {
-		if (!g->fifo.deferred_reset_pending) {
-			if (rc_type == RC_TYPE_MMU_FAULT) {
-				gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-				verbose = gk20a_fifo_error_tsg(g, tsg);
-			}
-		}
-		gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
-		if (refch)
-			gk20a_channel_put(refch);
-	} else if (refch) {
-		if (!g->fifo.deferred_reset_pending) {
-			if (rc_type == RC_TYPE_MMU_FAULT) {
-				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
-				verbose = gk20a_fifo_error_ch(g, refch);
-			}
-		}
-		gk20a_channel_abort(refch, false);
-		gk20a_channel_put(refch);
-	} else {
-		nvgpu_err(g, "id unknown, abort runlist");
-		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
-						runlist_id++) {
-			if (runlists_mask & BIT(runlist_id))
-				g->ops.fifo.update_runlist(g, runlist_id,
-					FIFO_INVAL_CHANNEL_ID, false, true);
-		}
-	}
-
 	/* check if engine reset should be deferred */
 	for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
 					runlist_id++) {
@@ -1051,7 +1029,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		unsigned long __reset_eng_bitmask =
 			runlist->reset_eng_bitmask;
 
-		for_each_set_bit(engine_id, &__reset_eng_bitmask, 32) {
+		for_each_set_bit(engine_id, &__reset_eng_bitmask,
+						g->fifo.max_engines) {
 			if ((refch || tsg) &&
 				gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
@@ -1061,7 +1040,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 				/* handled during channel free */
 				g->fifo.deferred_reset_pending = true;
 
-				gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
+				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 					"sm debugger attached,"
 					" deferring channel recovery to channel free");
 			} else {
@@ -1084,12 +1063,42 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	}
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
-	if (refch)
-		gk20a_ctxsw_trace_channel_reset(g, refch);
-	else if (tsg)
+	/* tsg and refch both could be valid for mmu fault. Check tsg first */
+	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
+	else if (refch)
+		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
 
+	if (tsg) {
+		if (!g->fifo.deferred_reset_pending) {
+			if (rc_type == RC_TYPE_MMU_FAULT) {
+				gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+				verbose = gk20a_fifo_error_tsg(g, tsg);
+			}
+		}
+		gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
+		if (refch)
+			gk20a_channel_put(refch);
+	} else if (refch) {
+		if (!g->fifo.deferred_reset_pending) {
+			if (rc_type == RC_TYPE_MMU_FAULT) {
+				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
+				verbose = gk20a_fifo_error_ch(g, refch);
+			}
+		}
+		gk20a_channel_abort(refch, false);
+		gk20a_channel_put(refch);
+	} else {
+		nvgpu_err(g, "id unknown, abort runlist");
+		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
+						runlist_id++) {
+			if (runlists_mask & BIT(runlist_id))
+				g->ops.fifo.update_runlist(g, runlist_id,
+					FIFO_INVAL_CHANNEL_ID, false, true);
+		}
+	}
+
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED,
 					!RUNLIST_INFO_MUTEX_LOCKED);
 
-- 
cgit v1.2.2
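
The six steps in the commit message map onto the reorganized
gv11b_fifo_teardown_ch_tsg() flow above: disable scheduling, disable the
TSG/channel, preempt, check the preempt result, reset only the engines that
need it, then re-enable scheduling. The following stand-alone sketch models
that ordering only; every function in it is a hypothetical stub for
illustration (not an nvgpu API), and locking, error handling and the
deferred-reset path are omitted.

/*
 * Minimal model of the teardown ordering described in the commit message.
 * All functions are hypothetical stubs, not nvgpu APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static void sched_disable(void)
{
	/* Step 1: stop the runlist so ENG_STATUS can settle */
	printf("disable runlist scheduling (PFIFO_SCHED_DISABLE)\n");
}

static void disable_tsg_or_channel(void)
{
	/* Step 2: keep the TSG from being rescheduled later */
	printf("disable all channels in the TSG\n");
}

static void preempt(bool tsg_id_known)
{
	/* Step 3a or 3b, depending on whether the TSG id is known */
	printf("preempt %s\n", tsg_id_known ? "TSG by id" : "whole runlist");
}

static bool preempt_done(void)
{
	/* Step 4: poll for preempt completion; false means the ctx hung */
	return false;
}

static void reset_engine(void)
{
	/* Step 5: halt memory interface, reset via PMC_ENABLE, reinit */
	printf("reset faulted engine\n");
}

static void sched_enable(void)
{
	/* Step 6: let the runlist run again */
	printf("re-enable runlist scheduling (PFIFO_SCHED_ENABLE)\n");
}

int main(void)
{
	bool tsg_id_known = true;

	sched_disable();
	disable_tsg_or_channel();
	preempt(tsg_id_known);
	if (!preempt_done())
		reset_engine();
	sched_enable();
	return 0;
}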