Diffstat (limited to 'drivers')
-rw-r--r--   drivers/gpu/nvgpu/gk20a/channel_gk20a.c |  2
-rw-r--r--   drivers/gpu/nvgpu/gk20a/channel_gk20a.h |  2
-rw-r--r--   drivers/gpu/nvgpu/gv11b/fb_gv11b.c      | 25
-rw-r--r--   drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    |  3
-rw-r--r--   drivers/gpu/nvgpu/gv11b/gr_gv11b.c      | 35
5 files changed, 53 insertions, 14 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 551e8b04..4e6837ef 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -684,6 +684,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	ch->subctx_id = 0;
 	ch->runqueue_sel = 0;
 
+	ch->mmu_nack_handled = false;
+
 	/* The channel is *not* runnable at this point. It still needs to have
 	 * an address space bound and allocate a gpfifo and grctx. */
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index f95184be..aa37db62 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -295,6 +295,8 @@ struct channel_gk20a {
 
 	/* Any operating system specific data. */
 	void *os_priv;
+
+	bool mmu_nack_handled;
 };
 
 static inline struct channel_gk20a *
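
The two hunks above introduce a per-channel mmu_nack_handled flag, cleared when a channel is opened. As a reading aid only (not part of the patch), the sketch below outlines how the rest of the series appears to use the flag to coordinate the SM mmu-nack path and the mmu-fault path; the struct and function names here are placeholders, not nvgpu APIs.

	#include <errno.h>	/* EFAULT */
	#include <stdbool.h>

	/* Editor's sketch, illustrative only. */
	struct ch_sketch {
		bool mmu_nack_handled;	/* false when the channel is opened */
	};

	/* mmu nack serviced first (gr_gv11b.c hunk): mark the channel and
	 * force recovery by reporting an error to the caller. */
	static int nack_path(struct ch_sketch *ch)
	{
		if (!ch->mmu_nack_handled) {
			ch->mmu_nack_handled = true;
			return -EFAULT;		/* caller triggers recovery */
		}
		return 0;
	}

	/* mmu fault serviced second (fb_gv11b.c hunk): recovery already ran,
	 * so clear the flag and skip a second teardown. */
	static void fault_path(struct ch_sketch *ch)
	{
		if (ch->mmu_nack_handled) {
			ch->mmu_nack_handled = false;	/* re-arm the flag */
			return;
		}
		ch->mmu_nack_handled = true;	/* a nack may still follow */
	}
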
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index ce8f5669..bba7e66c 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -935,12 +935,37 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 				id = mmfault->chid;
 				id_type = ID_TYPE_CHANNEL;
 			}
+			if (mmfault->refch->mmu_nack_handled) {
+				/* We have already recovered for the same
+				 * context, so skip doing another recovery.
+				 */
+				mmfault->refch->mmu_nack_handled = false;
+				/*
+				 * The recovery path can be entered twice for
+				 * the same error in case of an mmu nack. If
+				 * the mmu nack interrupt is handled before the
+				 * mmu fault, the channel reference is raised
+				 * to keep userspace from closing the channel.
+				 * Drop that extra reference here.
+				 */
+				gk20a_channel_put(mmfault->refch);
+				/* refch in mmfault is assigned at the time of
+				 * copying fault info from the snap register or
+				 * the bar2 fault buffer.
+				 */
+				gk20a_channel_put(mmfault->refch);
+				return;
+			}
 		} else {
 			id_type = ID_TYPE_UNKNOWN;
 		}
 		if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
 			act_eng_bitmask = BIT(mmfault->faulted_engine);
 
+		/* Indicate that recovery is handled if the mmu fault is the
+		 * result of an mmu nack.
+		 */
+		mmfault->refch->mmu_nack_handled = true;
 		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
 			id, id_type, RC_TYPE_MMU_FAULT, mmfault);
 	} else {
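
The added block above drops two references on mmfault->refch when the second pass through recovery is skipped. As a reading aid only (not patch code), the reference balance appears to be the following, with a simplified ref_put() standing in for gk20a_channel_put():

	#include <stdbool.h>

	/* Editor's sketch of the reference balance in the skip branch;
	 * refcount stands in for the channel's actual reference counting. */
	struct ref_sketch { int refcount; bool mmu_nack_handled; };

	static void ref_put(struct ref_sketch *ch) { ch->refcount--; }

	static void skip_second_recovery(struct ref_sketch *refch)
	{
		if (!refch->mmu_nack_handled)
			return;			/* normal recovery proceeds */

		refch->mmu_nack_handled = false;
		ref_put(refch);	/* balances the get taken in the nack path */
		ref_put(refch);	/* balances the ref taken when fault info was
				 * copied from the snap reg / bar2 buffer */
	}
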
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 84d63b91..9c64675f 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -596,6 +596,9 @@ void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
 	nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
 			faulted_pbdma, faulted_engine);
 
+	if (!refch)
+		return;
+
 	if (gk20a_is_channel_marked_as_tsg(refch)) {
 		tsg = &g->fifo.tsg[refch->tsgid];
 		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 694ff8ad..aed45ceb 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
 		u32 warp_esr_error,
 		struct channel_gk20a *fault_ch)
 {
-	struct tsg_gk20a *tsg;
 	u32 offset;
+	int err = 0;
 
+	fault_ch = gk20a_channel_get(fault_ch);
 	if (fault_ch) {
-		tsg = &g->fifo.tsg[fault_ch->tsgid];
-
-		/*
-		 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK
-		 * to SM. So MMU_FAULT handling path will take care of
-		 * triggering RC recovery
-		 *
-		 * In MMU_NACK handling path, we just set the error notifier
-		 * and clear the interrupt so that the User Space sees the error
-		 * as soon as semaphores are released by SM
-		 */
-		gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+		if (!fault_ch->mmu_nack_handled) {
+			/* Recovery is not done yet for this channel, implying
+			 * the mmu nack interrupt is serviced before the mmu
+			 * fault. Force recovery by returning an error, and
+			 * indicate that a second recovery should be skipped.
+			 */
+			fault_ch->mmu_nack_handled = true;
+			err = -EFAULT;
+		}
 	}
+	/* else the mmu fault is serviced first and the channel is closed */
+
+	/* Do not release the channel reference, since we do not want
+	 * userspace to close this channel during recovery. Otherwise the mmu
+	 * fault handler would enter the recovery path even for an invalid
+	 * channel; it checks the teardown flag explicitly instead.
+	 */
+	if (!err)
+		gk20a_channel_put(fault_ch);
 
 	/* clear interrupt */
 	offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
 		"ESR %s(0x%x)",
 		"MMU NACK ERROR",
 		warp_esr_error);
-	return 0;
+	return err;
 }
 
 static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
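
When the nack handler above forces recovery (err is -EFAULT), it deliberately keeps the reference taken by gk20a_channel_get(); that reference is only dropped later, in the skip branch added to fb_gv11b.c. A minimal sketch of this put-on-success / keep-on-error pattern, with placeholder names (an editor's illustration, not patch code):

	#include <errno.h>
	#include <stdbool.h>

	/* ch_get()/ch_put() are trivial stand-ins for channel refcounting. */
	struct ch { int refcount; };

	static struct ch *ch_get(struct ch *c)
	{
		if (c)
			c->refcount++;
		return c;
	}

	static void ch_put(struct ch *c) { c->refcount--; }

	static int nack_handler_sketch(struct ch *fault_ch, bool recovery_needed)
	{
		int err = 0;

		fault_ch = ch_get(fault_ch);
		if (fault_ch && recovery_needed)
			err = -EFAULT;		/* caller triggers recovery */

		/* Keep the reference on error so userspace cannot close the
		 * channel before the mmu fault handler sees the flag; that
		 * handler drops the extra reference when it skips recovery. */
		if (!err && fault_ch)
			ch_put(fault_ch);

		return err;
	}
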