From cd6e821cf66837a2c3479e928414007064b9c496 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Fri, 23 Feb 2018 13:00:00 -0800
Subject: gpu: nvgpu: gv11b: add runlist abort & remove bare channel

-Add support for aborting runlist/s. Aborting runlist/s will abort
all active tsgs and associated active channels within these active
tsgs
-Bare channels are no longer supported. Remove recovery support for
bare channels. In case there are bare channels, recovery will
trigger runlist abort

Bug 2125776
Bug 2108544
Bug 2105322
Bug 2092051
Bug 2048824
Bug 2043838
Bug 2039587
Bug 2028993
Bug 2029245
Bug 2065990
Bug 1945121
Bug 200401707
Bug 200393631
Bug 200327596

Change-Id: I6bec8a0004508cf65ea128bf641a26bf4c2f236d
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/1640567
Reviewed-by: svc-mobile-coverity
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c |   6 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h |   3 +
 drivers/gpu/nvgpu/gv11b/fb_gv11b.c   |  79 ++++++++++++-------
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 149 ++++++++++++++++++-----------------
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.h |   2 +-
 5 files changed, 135 insertions(+), 104 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index c8789c3a..dc1f48b7 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -55,9 +55,7 @@
 #define FECS_METHOD_WFI_RESTORE 0x80000
 #define FECS_MAILBOX_0_ACK_RESTORE 0x4
 
-static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
-					    u32 chid, bool add,
-					    bool wait_for_finish);
+
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
 static const char *const pbdma_intr_fault_type_desc[] = {
@@ -3275,7 +3273,7 @@ void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
 			fifo_eng_runlist_length_f(count));
 }
 
-static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    u32 chid, bool add,
 					    bool wait_for_finish)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index d6e759ac..77030c94 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -254,6 +254,9 @@ int nvgpu_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next,
 int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 chid,
 			      bool add, bool wait_for_finish);
 
+int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+			      u32 chid, bool add,
+			      bool wait_for_finish);
 int gk20a_fifo_suspend(struct gk20a *g);
 
 bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index 54f0d2d8..2ceb816b 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -870,10 +870,11 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
 static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
 {
-	unsigned int id_type;
+	unsigned int id_type = ID_TYPE_UNKNOWN;
 	u32 num_lce, act_eng_bitmask = 0;
 	int err = 0;
-	u32 id = ((u32)~0);
+	u32 id = FIFO_INVAL_TSG_ID;
+	unsigned int rc_type = RC_TYPE_NO_RC;
 
 	if (!mmfault->valid)
 		return;
@@ -888,18 +889,23 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* CE page faults are not reported as replayable */
 		nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
 		err = gv11b_fb_fix_page_fault(g, mmfault);
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch,
-				mmfault->faulted_pbdma, mmfault->faulted_engine);
+		if (mmfault->refch &&
+			(u32)mmfault->refch->tsgid != FIFO_INVAL_TSG_ID) {
+			gv11b_fifo_reset_pbdma_and_eng_faulted(g,
+				&g->fifo.tsg[mmfault->refch->tsgid],
+				mmfault->faulted_pbdma,
+				mmfault->faulted_engine);
+		}
 		if (!err) {
 			nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
 			*invalidate_replay_val = 0;
-			/* refch in mmfault is assigned at the time of copying
-			 * fault info from snap reg or bar2 fault buf
-			 */
-			gk20a_channel_put(mmfault->refch);
+			if (mmfault->refch) {
+				gk20a_channel_put(mmfault->refch);
+				mmfault->refch = NULL;
+			}
 			return;
 		}
-		/* Do recovery. Channel recovery needs refch */
+		/* Do recovery */
 		nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
 	}
 
@@ -911,16 +917,9 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		 * instance block, the fault cannot be isolated to a
 		 * single context so we need to reset the entire runlist
 		 */
-		id_type = ID_TYPE_UNKNOWN;
+		rc_type = RC_TYPE_MMU_FAULT;
 	} else if (mmfault->refch) {
-		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
-			id = mmfault->refch->tsgid;
-			id_type = ID_TYPE_TSG;
-		} else {
-			id = mmfault->chid;
-			id_type = ID_TYPE_CHANNEL;
-		}
 
 		if (mmfault->refch->mmu_nack_handled) {
 			/* We have already recovered for the same
 			 * context, skip doing another recovery.
@@ -941,19 +940,40 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 			 */
 			gk20a_channel_put(mmfault->refch);
 			return;
+		} else {
+			/* Indicate recovery is handled if mmu fault is
+			 * a result of mmu nack.
+			 */
+			mmfault->refch->mmu_nack_handled = true;
+		}
+
+		rc_type = RC_TYPE_MMU_FAULT;
+		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
+			id = mmfault->refch->tsgid;
+			if (id != FIFO_INVAL_TSG_ID)
+				id_type = ID_TYPE_TSG;
+		} else {
+			nvgpu_err(g, "bare channels not supported");
 		}
-	} else {
-		id_type = ID_TYPE_UNKNOWN;
 	}
-	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
+
+	/* engine is faulted */
+	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) {
 		act_eng_bitmask = BIT(mmfault->faulted_engine);
+		rc_type = RC_TYPE_MMU_FAULT;
+	}
 
-	/* Indicate recovery is handled if mmu fault is a result of
-	 * mmu nack.
+	/* refch in mmfault is assigned at the time of copying
+	 * fault info from snap reg or bar2 fault buf
 	 */
-	mmfault->refch->mmu_nack_handled = true;
-	g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
-		id, id_type, RC_TYPE_MMU_FAULT, mmfault);
+	if (mmfault->refch) {
+		gk20a_channel_put(mmfault->refch);
+		mmfault->refch = NULL;
+	}
+
+	if (rc_type != RC_TYPE_NO_RC)
+		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
+			id, id_type, rc_type, mmfault);
 	} else {
 		if (mmfault->fault_type == gmmu_fault_type_pte_v()) {
 			nvgpu_log(g, gpu_dbg_intr, "invalid pte! try to fix");
@@ -972,7 +992,10 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* refch in mmfault is assigned at the time of copying
 		 * fault info from snap reg or bar2 fault buf
 		 */
-		gk20a_channel_put(mmfault->refch);
+		if (mmfault->refch) {
+			gk20a_channel_put(mmfault->refch);
+			mmfault->refch = NULL;
+		}
 	}
 }
 
@@ -1061,8 +1084,10 @@ void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 			next_fault_addr = mmfault->fault_addr;
 			if (prev_fault_addr == next_fault_addr) {
 				nvgpu_log(g, gpu_dbg_intr, "pte already scanned");
-				if (mmfault->refch)
+				if (mmfault->refch) {
 					gk20a_channel_put(mmfault->refch);
+					mmfault->refch = NULL;
+				}
 				continue;
 			}
 		}
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 8f0f6b0c..6df1d343 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -652,29 +652,19 @@ static void gv11b_reset_pbdma_faulted_tsg(struct tsg_gk20a *tsg)
 }
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct channel_gk20a *refch,
+			struct tsg_gk20a *tsg,
 			u32 faulted_pbdma, u32 faulted_engine)
 {
-	struct tsg_gk20a *tsg;
+	if (!tsg)
+		return;
 
 	nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
 			faulted_pbdma, faulted_engine);
 
-	if (!refch)
-		return;
-
-	if (gk20a_is_channel_marked_as_tsg(refch)) {
-		tsg = &g->fifo.tsg[refch->tsgid];
-		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-			gv11b_reset_pbdma_faulted_tsg(tsg);
-		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-			gv11b_reset_eng_faulted_tsg(tsg);
-	} else {
-		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-			gv11b_reset_pbdma_faulted_ch(g, refch->chid);
-		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-			gv11b_reset_eng_faulted_ch(g, refch->chid);
-	}
+	if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+		gv11b_reset_pbdma_faulted_tsg(tsg);
+	if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+		gv11b_reset_eng_faulted_tsg(tsg);
 }
 
 static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
@@ -992,12 +982,74 @@ int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
 }
 
+static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
+			unsigned int rc_type,
+			u32 runlists_mask)
+{
+	bool verbose = false;
+	struct tsg_gk20a *tsg = NULL;
+	u32 rlid, tsgid;
+	struct fifo_runlist_info_gk20a *runlist = NULL;
+	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+	u32 mutex_ret = 0;
+	bool add = false, wait_for_finish = false;
+	int err;
+
+	nvgpu_err(g, "runlist id unknown, abort active tsgs in runlists");
+
+	/* runlist_lock are locked by teardown */
+	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+	for (rlid = 0; rlid < g->fifo.max_runlists;
+						 rlid++) {
+		if (!(runlists_mask & BIT(rlid)))
+			continue;
+		nvgpu_log(g, gpu_dbg_info, "abort runlist id %d",
+				rlid);
+		runlist = &g->fifo.runlist_info[rlid];
+
+		for_each_set_bit(tsgid, runlist->active_tsgs,
+			g->fifo.num_channels) {
+			nvgpu_log(g, gpu_dbg_info, "abort tsg id %d", tsgid);
+			tsg = &g->fifo.tsg[tsgid];
+			gk20a_disable_tsg(tsg);
+
+			/* assume all pbdma and eng faulted are set */
+			nvgpu_log(g, gpu_dbg_info, "reset pbdma and eng faulted");
+			gv11b_reset_pbdma_faulted_tsg(tsg);
+			gv11b_reset_eng_faulted_tsg(tsg);
+
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+			gk20a_ctxsw_trace_tsg_reset(g, tsg);
+#endif
+			if (!g->fifo.deferred_reset_pending) {
+				if (rc_type == RC_TYPE_MMU_FAULT) {
+					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+					verbose = gk20a_fifo_error_tsg(g, tsg);
+				}
+			}
+
+			/* (chid == ~0 && !add) remove all act ch from runlist*/
+			err = gk20a_fifo_update_runlist_locked(g, rlid,
+					FIFO_INVAL_CHANNEL_ID, add, wait_for_finish);
+			if (err)
+				nvgpu_err(g, "runlist id %d is not cleaned up",
+					rlid);
+
+			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
+
+			nvgpu_log(g, gpu_dbg_info, "aborted tsg id %d", tsgid);
+		}
+	}
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+}
+
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			u32 id, unsigned int id_type, unsigned int rc_type,
 			struct mmu_fault_info *mmfault)
 {
 	struct tsg_gk20a *tsg = NULL;
-	struct channel_gk20a *refch = NULL;
 	u32 runlists_mask, rlid;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
@@ -1022,21 +1074,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		} else {
 			nvgpu_log_fn(g, "id type is tsg but tsg id is inval");
 		}
-	} else if (id_type == ID_TYPE_CHANNEL) {
-		if (id != FIFO_INVAL_CHANNEL_ID) {
-			runlist_id = f->channel[id].runlist_id;
-			if (runlist_id != FIFO_INVAL_RUNLIST_ID)
-				num_runlists++;
-			else
-				nvgpu_log_fn(g, "ch runlist id is invalid");
-
-			if ((u32)f->channel[id].tsgid != FIFO_INVAL_TSG_ID)
-				tsg = &f->tsg[f->channel[id].tsgid];
-			else
-				nvgpu_log_fn(g, "tsgid for ch is invalid");
-		} else {
-			nvgpu_log_fn(g, "id type is ch but ch id is inval");
-		}
 	} else {
 		/*
 		 * id type is unknown, get runlist_id if eng mask is such that
@@ -1103,27 +1140,16 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
 
-	/* Get tsg/ch */
 	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
-		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
-						mmfault->faulted_pbdma,
-						mmfault->faulted_engine);
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, tsg,
+				mmfault->faulted_pbdma,
+				mmfault->faulted_engine);
 	}
 
-	if (id_type == ID_TYPE_TSG) {
-		tsg = &g->fifo.tsg[id];
-	} else if (id_type == ID_TYPE_CHANNEL) {
-		if (refch == NULL)
-			refch = gk20a_channel_get(&g->fifo.channel[id]);
-	}
-
-	/* Disable tsg/ch */
 	if (tsg)
 		gk20a_disable_tsg(tsg);
-	else if (refch)
-		g->ops.fifo.disable_channel(refch);
 
 	/*
 	 * Even though TSG preempt timed out, the RC sequence would by design
@@ -1134,7 +1160,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	 * that all PBDMAs serving the engine are not loaded when engine is
 	 * reset.
 	 */
-	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
+	if (tsg) {
 		int preempt_failed;
 
 		preempt_failed = g->ops.fifo.preempt_ch_tsg(g, id, id_type);
@@ -1156,7 +1182,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 		for_each_set_bit(engine_id, &__reset_eng_bitmask,
 						g->fifo.max_engines) {
-			if ((refch || tsg) &&
+			if (tsg &&
 				gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
@@ -1188,13 +1214,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	}
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
-	/* tsg and refch both could be valid for mmu fault. Check tsg first */
 	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
-	else if (refch)
-		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
-
 	if (tsg) {
 		if (g->fifo.deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
@@ -1204,26 +1226,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
 		}
 
-		if (refch)
-			gk20a_channel_put(refch);
-	} else if (refch) {
-		if (g->fifo.deferred_reset_pending) {
-			g->ops.fifo.disable_channel(refch);
-		} else {
-			if (rc_type == RC_TYPE_MMU_FAULT)
-				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
-
-			gk20a_channel_abort(refch, false);
-		}
-		gk20a_channel_put(refch);
 	} else {
-		nvgpu_err(g, "id unknown, abort runlist");
-		for (rlid = 0; rlid < g->fifo.max_runlists;
-						 rlid++) {
-			if (runlists_mask & BIT(rlid))
-				g->ops.fifo.update_runlist(g, rlid,
-					FIFO_INVAL_CHANNEL_ID, false, true);
-		}
+		gv11b_fifo_locked_abort_runlist_active_tsgs(g, rc_type,
+				runlists_mask);
 	}
 
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED);
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index 3dfc337c..aee7aef2 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -56,7 +56,7 @@ struct gpu_ops;
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct channel_gk20a *refch,
+			struct tsg_gk20a *tsg,
 			u32 faulted_pbdma, u32 faulted_engine);
 void gv11b_mmu_fault_id_to_eng_pbdma_id_and_veid(struct gk20a *g,
 	u32 mmu_fault_id, u32 *active_engine_id, u32 *veid, u32 *pbdma_id);
-- 
cgit v1.2.2