From 98d996f4ffb0137d119b5849cae46d7b7e5693e1 Mon Sep 17 00:00:00 2001 From: Aparna Das Date: Mon, 4 Jun 2018 19:40:56 -0700 Subject: gpu: nvgpu: recover on first interrupt reported for mmu nack In case of mmu nack, the error interrupt is received twice, through the SM reported mmu nack interrupt and the mmu fault, in undetermined order. Recover on the first received interrupt to avoid semaphore release and skip doing a second recovery. Also fix NULL pointer dereference in function gv11b_fifo_reset_pbdma_and_eng_faulted when channel reference is invalid in teardown path. Bug 200382235 Change-Id: I361a5725d7b6355ebf02b2870727f647fbd7a37e Signed-off-by: Aparna Das Reviewed-on: https://git-master.nvidia.com/r/1739804 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 2 ++ drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 2 ++ drivers/gpu/nvgpu/gv11b/fb_gv11b.c | 25 +++++++++++++++++++++++ drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 3 +++ drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 35 ++++++++++++++++++++------------- 5 files changed, 53 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 551e8b04..4e6837ef 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -684,6 +684,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g, ch->subctx_id = 0; ch->runqueue_sel = 0; + ch->mmu_nack_handled = false; + /* The channel is *not* runnable at this point. It still needs to have * an address space bound and allocate a gpfifo and grctx. */ diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index f95184be..aa37db62 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -295,6 +295,8 @@ struct channel_gk20a { /* Any operating system specific data. 
*/ void *os_priv; + + bool mmu_nack_handled; }; static inline struct channel_gk20a * diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c index ce8f5669..bba7e66c 100644 --- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c @@ -935,12 +935,37 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g, id = mmfault->chid; id_type = ID_TYPE_CHANNEL; } + if (mmfault->refch->mmu_nack_handled) { + /* We have already recovered for the same + * context, skip doing another recovery. + */ + mmfault->refch->mmu_nack_handled = false; + /* + * Recovery path can be entered twice for the + * same error in case of mmu nack. If mmu + * nack interrupt is handled before mmu fault + * then channel reference is increased to avoid + * closing the channel by userspace. Decrement + * channel reference. + */ + gk20a_channel_put(mmfault->refch); + /* refch in mmfault is assigned at the time + * of copying fault info from snap reg or bar2 + * fault buf. + */ + gk20a_channel_put(mmfault->refch); + return; + } } else { id_type = ID_TYPE_UNKNOWN; } if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) act_eng_bitmask = BIT(mmfault->faulted_engine); + /* Indicate recovery is handled if mmu fault is a result of + * mmu nack. 
+ */ + mmfault->refch->mmu_nack_handled = true; g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, id, id_type, RC_TYPE_MMU_FAULT, mmfault); } else { diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index 84d63b91..9c64675f 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c @@ -596,6 +596,9 @@ void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g, nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x", faulted_pbdma, faulted_engine); + if (!refch) + return; + if (gk20a_is_channel_marked_as_tsg(refch)) { tsg = &g->fifo.tsg[refch->tsgid]; if (faulted_pbdma != FIFO_INVAL_PBDMA_ID) diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 694ff8ad..aed45ceb 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g, u32 warp_esr_error, struct channel_gk20a *fault_ch) { - struct tsg_gk20a *tsg; u32 offset; + int err = 0; + fault_ch = gk20a_channel_get(fault_ch); if (fault_ch) { - tsg = &g->fifo.tsg[fault_ch->tsgid]; - - /* - * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK - * to SM. So MMU_FAULT handling path will take care of - * triggering RC recovery - * - * In MMU_NACK handling path, we just set the error notifier - * and clear the interrupt so that the User Space sees the error - * as soon as semaphores are released by SM - */ - gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); + if (!fault_ch->mmu_nack_handled) { + /* recovery is not done for the channel implying mmu + * nack interrupt is serviced before mmu fault. Force + * recovery by returning an error. Also indicate we + * should skip a second recovery. 
+ */ + fault_ch->mmu_nack_handled = true; + err = -EFAULT; + } } + /* else mmu fault is serviced first and channel is closed */ + + /* do not release reference to ch as we do not want userspace to close + * this channel on recovery. Otherwise mmu fault handler will enter + * recovery path even if channel is invalid. We want to explicitly check + * for teardown value in mmu fault handler. + */ + if (!err) + gk20a_channel_put(fault_ch); /* clear interrupt */ offset = gk20a_gr_gpc_offset(g, gpc) + @@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g, "ESR %s(0x%x)", "MMU NACK ERROR", warp_esr_error); - return 0; + return err; } static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) -- cgit v1.2.2