From 0d088ad70cb43e54661163971095409c76a79f51 Mon Sep 17 00:00:00 2001
From: Sagar Kamble
Date: Mon, 3 May 2021 23:17:16 +0530
Subject: gpu: nvgpu: wait for stalling interrupts to complete during TSG
 unbind preempt

Some of the engine stalling interrupts can block the context save off
the engine if not handled during fifo.preempt_tsg. They need to be
handled while polling for engine ctxsw status.

Bug 200711183
Bug 200726848

Change-Id: Ie45d76d9d1d8be3ffb842670843507f2d9aea6d0
Signed-off-by: Sagar Kamble
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2521971
(cherry picked from commit I7418a9e0354013b81fbefd8c0cab5068404fc44e)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2523938
Reviewed-by: svc-mobile-coverity
Reviewed-by: Deepak Nibade
Reviewed-by: Bibek Basu
Reviewed-by: mobile promotions
Tested-by: mobile promotions
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 24 +++++---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  7 ++-
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    | 97 ++++++++++++++++++++++-----------
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.h    |  4 +-
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h |  2 +-
 5 files changed, 90 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index e91830f8..049b8da2 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -2981,7 +2981,7 @@ static u32 gk20a_fifo_get_preempt_timeout(struct gk20a *g)
 }
 
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, bool preempt_retries_left)
 {
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
@@ -3037,7 +3037,8 @@ void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch)
 			RC_TYPE_PREEMPT_TIMEOUT);
 }
 
-int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
+int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg,
+		bool preempt_retries_left)
 {
 	int ret;
 	unsigned int id_type;
@@ -3049,8 +3050,17 @@ int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 
 	id_type = is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
 
-	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
+	/*
+	 * Poll for preempt done. if stalling interrupts are pending
+	 * while preempt is in progress we poll for stalling interrupts
+	 * to finish based on return value from this function and
+	 * retry preempt again.
+	 * If HW is hung, on the last retry instance we try to identify
+	 * the engines hung and set the runlist reset_eng_bitmask
+	 * and mark preemption completion.
+	 */
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
+			preempt_retries_left);
 
 	return ret;
 }
@@ -3072,7 +3082,7 @@ int gk20a_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch)
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, ch->chid, false);
+	ret = __locked_fifo_preempt(g, ch->chid, false, false);
 
 	if (!mutex_ret) {
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
@@ -3112,7 +3122,7 @@ int gk20a_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, tsg->tsgid, true);
+	ret = __locked_fifo_preempt(g, tsg->tsgid, true, false);
 
 	if (!mutex_ret) {
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
@@ -3785,7 +3795,7 @@ static int __locked_fifo_reschedule_preempt_next(struct channel_gk20a *ch,
 			gk20a_readl(g, fifo_preempt_r()));
 #endif
 	if (wait_preempt) {
-		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type);
+		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type, false);
 	}
 #ifdef TRACEPOINTS_ENABLED
 	trace_gk20a_reschedule_preempted_next(ch->chid);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 26365cae..078236d0 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -1,7 +1,7 @@
 /*
  * GK20A graphics fifo (gr host)
  *
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -388,8 +388,9 @@ void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a);
 u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g);
 
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
-int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg);
+		unsigned int id_type, bool preempt_retries_left);
+int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg,
+		bool preempt_retries_left);
 void gk20a_fifo_preempt_timeout_rc_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
 void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch);
 int gk20a_fifo_setup_ramfc(struct channel_gk20a *c,
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index be4d56a8..cc43ee33 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -1,7 +1,7 @@
 /*
  * GV11B fifo
  *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -498,7 +498,8 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 }
 
 static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
-		u32 act_eng_id, u32 *reset_eng_bitmask)
+		u32 act_eng_id, u32 *reset_eng_bitmask,
+		bool preempt_retries_left)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
@@ -507,6 +508,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 	int ret = -EBUSY;
 	unsigned int loop_count = 0;
 	u32 eng_intr_pending;
+	bool check_preempt_retry = false;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -565,9 +567,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
			 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
 			if (eng_intr_pending) {
-				/* if eng intr, stop polling */
-				*reset_eng_bitmask |= BIT(act_eng_id);
-				ret = 0;
+				check_preempt_retry = true;
 				break;
 			}
 
@@ -578,9 +578,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 
 			if (id == fifo_engine_status_id_v(eng_stat)) {
 				if (eng_intr_pending) {
-					/* preemption will not finish */
-					*reset_eng_bitmask |= BIT(act_eng_id);
-					ret = 0;
+					check_preempt_retry = true;
 					break;
 				}
 			} else {
@@ -594,9 +592,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
 				if (eng_intr_pending) {
-					/* preemption will not finish */
-					*reset_eng_bitmask |= BIT(act_eng_id);
-					ret = 0;
+					check_preempt_retry = true;
 					break;
 				}
 			} else {
@@ -606,8 +602,13 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			}
 
 		} else {
-			/* Preempt should be finished */
-			ret = 0;
+			if (eng_intr_pending) {
+				check_preempt_retry = true;
+			} else {
+				/* Preempt should be finished */
+				ret = 0;
+			}
+
 			break;
 		}
 		nvgpu_usleep_range(delay, delay * 2);
@@ -615,7 +616,19 @@
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
 	} while (!nvgpu_timeout_expired(&timeout));
-	if (ret) {
+
+	/* if eng intr, stop polling and check if we can retry preempts. */
+	if (check_preempt_retry) {
+		if (preempt_retries_left) {
+			ret = -EAGAIN;
+		} else {
+			/* preemption will not finish */
+			*reset_eng_bitmask |= BIT32(act_eng_id);
+			ret = 0;
+		}
+	}
+
+	if (ret && ret != -EAGAIN) {
 		/*
 		 * The reasons a preempt can fail are:
 		 * 1.Some other stalling interrupt is asserted preventing
@@ -770,7 +783,7 @@ static void gv11b_fifo_issue_runlist_preempt(struct gk20a *g,
 }
 
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, bool preempt_retries_left)
 {
 	struct fifo_gk20a *f = &g->fifo;
 	unsigned long runlist_served_pbdmas;
@@ -778,7 +791,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
-	int ret = 0;
+	int err, ret = 0;
 	u32 tsgid;
 
 	if (id_type == ID_TYPE_TSG) {
@@ -795,14 +808,21 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
 
 	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
-		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+		err = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+		if (err != 0) {
+			ret = err;
+		}
 	}
 
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
 
 	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
-		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
-				&f->runlist_info[runlist_id].reset_eng_bitmask);
+		err = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+				&f->runlist_info[runlist_id].reset_eng_bitmask,
+				preempt_retries_left);
+		if ((err != 0) && (ret == 0)) {
+			ret = err;
+		}
 	}
 	return ret;
 }
@@ -847,10 +867,13 @@ int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg)
 
 int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 {
 	struct fifo_gk20a *f = &g->fifo;
-	u32 ret = 0;
+	int ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
 	u32 runlist_id;
+	u32 preempt_retry_count = 10U;
+	u32 preempt_retry_timeout =
+			g->ops.fifo.get_preempt_timeout(g) / preempt_retry_count;
 
 	nvgpu_log_fn(g, "tsgid: %d", tsg->tsgid);
@@ -860,23 +883,35 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 		return 0;
 	}
 
-	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
+	do {
+		nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_disable_tsg_sched(g, tsg);
+		/* WAR for Bug 2065990 */
+		gk20a_fifo_disable_tsg_sched(g, tsg);
 
-	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+		mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, tsg->tsgid, true);
+		ret = __locked_fifo_preempt(g, tsg->tsgid, true,
+				preempt_retry_count > 1U);
 
-	if (!mutex_ret) {
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	}
+		if (!mutex_ret) {
+			nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+		}
+
+		/* WAR for Bug 2065990 */
+		gk20a_fifo_enable_tsg_sched(g, tsg);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_enable_tsg_sched(g, tsg);
+		nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 
-	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
+		if (ret != -EAGAIN) {
+			break;
+		}
+
+		ret = nvgpu_wait_for_stall_interrupts(g, preempt_retry_timeout);
+		if (ret != 0) {
+			nvgpu_log_info(g, "wait for stall interrupts failed %d", ret);
+		}
+	} while (--preempt_retry_count != 0U);
 
 	if (ret) {
 		if (nvgpu_platform_is_silicon(g)) {
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index abbf77a6..4e6bd6ba 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -1,7 +1,7 @@
 /*
  * GV11B Fifo
  *
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -80,7 +80,7 @@ void gv11b_dump_eng_status(struct gk20a *g,
 u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g);
 int gv11b_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next);
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
+		unsigned int id_type, bool preempt_retries_left);
 int gv11b_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch);
 int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
 int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 3b193dbe..d6d6e939 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -719,7 +719,7 @@ struct gpu_ops {
 				struct ch_state *ch_state);
 		u32 (*intr_0_error_mask)(struct gk20a *g);
 		int (*is_preempt_pending)(struct gk20a *g, u32 id,
-			unsigned int id_type);
+			unsigned int id_type, bool preempt_retries_left);
 		void (*init_pbdma_intr_descs)(struct fifo_gk20a *f);
 		int (*reset_enable_hw)(struct gk20a *g);
 		int (*setup_userd)(struct channel_gk20a *c);
--
cgit v1.2.2
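
The diff above makes gv11b_fifo_preempt_tsg() retry the whole preempt sequence
when the engine ctxsw-status poll bails out because a stalling interrupt is
pending. As a reading aid, here is a minimal, self-contained C sketch of that
retry shape under simplified assumptions: locked_preempt_stub() and
wait_for_stall_interrupts_stub() are hypothetical stand-ins for
__locked_fifo_preempt() and nvgpu_wait_for_stall_interrupts(), budget_ms stands
for g->ops.fifo.get_preempt_timeout(g), and the runlist lock, PMU mutex, and
TSG-scheduling WAR handling of the real driver are elided.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for __locked_fifo_preempt(): while a stalling
 * interrupt is pending and retries remain, report -EAGAIN instead of
 * marking the engine for reset.
 */
static int locked_preempt_stub(unsigned int tsgid, bool retries_left)
{
	static int polls_with_stall_pending = 2;	/* simulated condition */

	(void)tsgid;
	if (polls_with_stall_pending > 0) {
		polls_with_stall_pending--;
		return retries_left ? -EAGAIN : 0;
	}
	return 0;
}

/* Hypothetical stand-in for nvgpu_wait_for_stall_interrupts(). */
static int wait_for_stall_interrupts_stub(unsigned int timeout_ms)
{
	printf("waiting up to %u ms for stalling interrupts\n", timeout_ms);
	return 0;
}

/*
 * Shape of the retry loop added to gv11b_fifo_preempt_tsg(): the preempt
 * timeout budget is split across preempt_retry_count attempts; -EAGAIN
 * means "stalling interrupt pending, let it be handled and try again".
 */
static int preempt_tsg_with_retries(unsigned int tsgid, unsigned int budget_ms)
{
	unsigned int preempt_retry_count = 10U;
	unsigned int preempt_retry_timeout = budget_ms / preempt_retry_count;
	int ret;

	do {
		/* locking and scheduling WARs from the real code elided */
		ret = locked_preempt_stub(tsgid, preempt_retry_count > 1U);
		if (ret != -EAGAIN) {
			break;
		}

		ret = wait_for_stall_interrupts_stub(preempt_retry_timeout);
		if (ret != 0) {
			printf("wait for stall interrupts failed %d\n", ret);
		}
	} while (--preempt_retry_count != 0U);

	return ret;
}

int main(void)
{
	return preempt_tsg_with_retries(0U, 3000U) != 0;
}

The key design point, per the commit's own comment, is that only the final
attempt (preempt_retry_count == 1, so retries_left is false) is allowed to mark
hung engines in reset_eng_bitmask and declare preemption complete; earlier
attempts return -EAGAIN so the caller can let stalling interrupts be handled
and try again.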