summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAparna Das <aparnad@nvidia.com>2018-06-04 22:40:56 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2018-06-15 20:47:06 -0400
commit98d996f4ffb0137d119b5849cae46d7b7e5693e1 (patch)
tree4e3243b91e8e559dd14b8bc30fc2468713182027
parent1f51620fda57443c77506c354af837a60883d78b (diff)
gpu: nvgpu: recover on first interrupt reported for mmu nack
In case of mmu nack error, the interrupt is received twice — through the SM-reported mmu nack interrupt and the mmu fault — in undetermined order. Recover on the first received interrupt to avoid semaphore release, and skip doing a second recovery. Also fix a NULL pointer dereference in function gv11b_fifo_reset_pbdma_and_eng_faulted when the channel reference is invalid in the teardown path. Bug 200382235 Change-Id: I361a5725d7b6355ebf02b2870727f647fbd7a37e Signed-off-by: Aparna Das <aparnad@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1739804 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c2
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.h2
-rw-r--r--drivers/gpu/nvgpu/gv11b/fb_gv11b.c25
-rw-r--r--drivers/gpu/nvgpu/gv11b/fifo_gv11b.c3
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c35
5 files changed, 53 insertions, 14 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 551e8b04..4e6837ef 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -684,6 +684,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
684 ch->subctx_id = 0; 684 ch->subctx_id = 0;
685 ch->runqueue_sel = 0; 685 ch->runqueue_sel = 0;
686 686
687 ch->mmu_nack_handled = false;
688
687 /* The channel is *not* runnable at this point. It still needs to have 689 /* The channel is *not* runnable at this point. It still needs to have
688 * an address space bound and allocate a gpfifo and grctx. */ 690 * an address space bound and allocate a gpfifo and grctx. */
689 691
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index f95184be..aa37db62 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -295,6 +295,8 @@ struct channel_gk20a {
295 295
296 /* Any operating system specific data. */ 296 /* Any operating system specific data. */
297 void *os_priv; 297 void *os_priv;
298
299 bool mmu_nack_handled;
298}; 300};
299 301
300static inline struct channel_gk20a * 302static inline struct channel_gk20a *
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index ce8f5669..bba7e66c 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -935,12 +935,37 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
935 id = mmfault->chid; 935 id = mmfault->chid;
936 id_type = ID_TYPE_CHANNEL; 936 id_type = ID_TYPE_CHANNEL;
937 } 937 }
938 if (mmfault->refch->mmu_nack_handled) {
939 /* We have already recovered for the same
940 * context, skip doing another recovery.
941 */
942 mmfault->refch->mmu_nack_handled = false;
943 /*
944 * Recovery path can be entered twice for the
945 * same error in case of mmu nack. If mmu
946 * nack interrupt is handled before mmu fault
947 * then channel reference is increased to avoid
948 * closing the channel by userspace. Decrement
949 * channel reference.
950 */
951 gk20a_channel_put(mmfault->refch);
952 /* refch in mmfault is assigned at the time
953 * of copying fault info from snap reg or bar2
954 * fault buf.
955 */
956 gk20a_channel_put(mmfault->refch);
957 return;
958 }
938 } else { 959 } else {
939 id_type = ID_TYPE_UNKNOWN; 960 id_type = ID_TYPE_UNKNOWN;
940 } 961 }
941 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) 962 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
942 act_eng_bitmask = BIT(mmfault->faulted_engine); 963 act_eng_bitmask = BIT(mmfault->faulted_engine);
943 964
965 /* Indicate recovery is handled if mmu fault is a result of
966 * mmu nack.
967 */
968 mmfault->refch->mmu_nack_handled = true;
944 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, 969 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
945 id, id_type, RC_TYPE_MMU_FAULT, mmfault); 970 id, id_type, RC_TYPE_MMU_FAULT, mmfault);
946 } else { 971 } else {
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 84d63b91..9c64675f 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -596,6 +596,9 @@ void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
596 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x", 596 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
597 faulted_pbdma, faulted_engine); 597 faulted_pbdma, faulted_engine);
598 598
599 if (!refch)
600 return;
601
599 if (gk20a_is_channel_marked_as_tsg(refch)) { 602 if (gk20a_is_channel_marked_as_tsg(refch)) {
600 tsg = &g->fifo.tsg[refch->tsgid]; 603 tsg = &g->fifo.tsg[refch->tsgid];
601 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID) 604 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 694ff8ad..aed45ceb 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2093 u32 warp_esr_error, 2093 u32 warp_esr_error,
2094 struct channel_gk20a *fault_ch) 2094 struct channel_gk20a *fault_ch)
2095{ 2095{
2096 struct tsg_gk20a *tsg;
2097 u32 offset; 2096 u32 offset;
2097 int err = 0;
2098 2098
2099 fault_ch = gk20a_channel_get(fault_ch);
2099 if (fault_ch) { 2100 if (fault_ch) {
2100 tsg = &g->fifo.tsg[fault_ch->tsgid]; 2101 if (!fault_ch->mmu_nack_handled) {
2101 2102 /* recovery is not done for the channel implying mmu
2102 /* 2103 * nack interrupt is serviced before mmu fault. Force
2103 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK 2104 * recovery by returning an error. Also indicate we
2104 * to SM. So MMU_FAULT handling path will take care of 2105 * should skip a second recovery.
2105 * triggering RC recovery 2106 */
2106 * 2107 fault_ch->mmu_nack_handled = true;
2107 * In MMU_NACK handling path, we just set the error notifier 2108 err = -EFAULT;
2108 * and clear the interrupt so that the User Space sees the error 2109 }
2109 * as soon as semaphores are released by SM
2110 */
2111 gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
2112 } 2110 }
2111 /* else mmu fault is serviced first and channel is closed */
2112
2113 /* do not release reference to ch as we do not want userspace to close
2114 * this channel on recovery. Otherwise mmu fault handler will enter
2115 * recovery path even if channel is invalid. We want to explicitly check
2116 * for teardown value in mmu fault handler.
2117 */
2118 if (!err)
2119 gk20a_channel_put(fault_ch);
2113 2120
2114 /* clear interrupt */ 2121 /* clear interrupt */
2115 offset = gk20a_gr_gpc_offset(g, gpc) + 2122 offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2122 "ESR %s(0x%x)", 2129 "ESR %s(0x%x)",
2123 "MMU NACK ERROR", 2130 "MMU NACK ERROR",
2124 warp_esr_error); 2131 warp_esr_error);
2125 return 0; 2132 return err;
2126} 2133}
2127 2134
2128static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) 2135static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)