gpu: nvgpu: recover on first interrupt reported for mmu nack

In case of an mmu nack error, the interrupt is received twice, through the SM-reported mmu nack interrupt and the mmu fault, in undetermined order. Recover on the first received interrupt to avoid semaphore release and skip doing a second recovery. Also fix a NULL pointer dereference in function gv11b_fifo_reset_pbdma_and_eng_faulted when the channel reference is invalid in the teardown path. Bug 200382235 Change-Id: I361a5725d7b6355ebf02b2870727f647fbd7a37e Signed-off-by: Aparna Das <aparnad@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1739804 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Aparna Das <aparnad@nvidia.com> 2018-06-04 22:40:56 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2018-06-15 20:47:06 -0400
commit: 98d996f4ffb0137d119b5849cae46d7b7e5693e1 (patch)
tree: 4e3243b91e8e559dd14b8bc30fc2468713182027 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent: 1f51620fda57443c77506c354af837a60883d78b (diff)
1 files changed, 21 insertions, 14 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 694ff8ad..aed45ceb 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
        u32 warp_esr_error,
        struct channel_gk20a *fault_ch)
 {
-        struct tsg_gk20a *tsg;
        u32 offset;
+        int err = 0;
+        fault_ch = gk20a_channel_get(fault_ch);
        if (fault_ch) {
-                tsg = &g->fifo.tsg[fault_ch->tsgid];
+                if (!fault_ch->mmu_nack_handled) {
+                        /* recovery is not done for the channel implying mmu
-                /*
+                         * nack interrupt is serviced before mmu fault. Force
-                 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK
+                         * recovery by returning an error. Also indicate we
-                 * to SM. So MMU_FAULT handling path will take care of
+                         * should skip a second recovery.
-                 * triggering RC recovery
+                         */
-                 *
+                        fault_ch->mmu_nack_handled = true;
-                 * In MMU_NACK handling path, we just set the error notifier
+                        err = -EFAULT;
-                 * and clear the interrupt so that the User Space sees the error
+                }
-                 * as soon as semaphores are released by SM
-                 */
-                gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
        }
+        /* else mmu fault is serviced first and channel is closed */
+        /* do not release reference to ch as we do not want userspace to close
+         * this channel on recovery. Otherwise mmu fault handler will enter
+         * recovery path even if channel is invalid. We want to explicitly check
+         * for teardown value in mmu fault handler.
+         */
+        if (!err)
+                gk20a_channel_put(fault_ch);
        /* clear interrupt */
        offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
                        "ESR %s(0x%x)",
                        "MMU NACK ERROR",
                        warp_esr_error);
-        return 0;
+        return err;
 }
 static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
author	Aparna Das <aparnad@nvidia.com>	2018-06-04 22:40:56 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2018-06-15 20:47:06 -0400
commit	98d996f4ffb0137d119b5849cae46d7b7e5693e1 (patch)
tree	4e3243b91e8e559dd14b8bc30fc2468713182027 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent	1f51620fda57443c77506c354af837a60883d78b (diff)

diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 694ff8ad..aed45ceb 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2093	u32 warp_esr_error,	2093	u32 warp_esr_error,
2094	struct channel_gk20a *fault_ch)	2094	struct channel_gk20a *fault_ch)
2095	{	2095	{
2096	struct tsg_gk20a *tsg;
2097	u32 offset;	2096	u32 offset;
		2097	int err = 0;
2098		2098
		2099	fault_ch = gk20a_channel_get(fault_ch);
2099	if (fault_ch) {	2100	if (fault_ch) {
2100	tsg = &g->fifo.tsg[fault_ch->tsgid];	2101	if (!fault_ch->mmu_nack_handled) {
2101		2102	/* recovery is not done for the channel implying mmu
2102	/*	2103	* nack interrupt is serviced before mmu fault. Force
2103	* Upon receiving MMU_FAULT error, MMU will forward MMU_NACK	2104	* recovery by returning an error. Also indicate we
2104	* to SM. So MMU_FAULT handling path will take care of	2105	* should skip a second recovery.
2105	* triggering RC recovery	2106	*/
2106	*	2107	fault_ch->mmu_nack_handled = true;
2107	* In MMU_NACK handling path, we just set the error notifier	2108	err = -EFAULT;
2108	* and clear the interrupt so that the User Space sees the error	2109	}
2109	* as soon as semaphores are released by SM
2110	*/
2111	gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
2112	}	2110	}
		2111	/* else mmu fault is serviced first and channel is closed */
		2112
		2113	/* do not release reference to ch as we do not want userspace to close
		2114	* this channel on recovery. Otherwise mmu fault handler will enter
		2115	* recovery path even if channel is invalid. We want to explicitly check
		2116	* for teardown value in mmu fault handler.
		2117	*/
		2118	if (!err)
		2119	gk20a_channel_put(fault_ch);
2113		2120
2114	/* clear interrupt */	2121	/* clear interrupt */
2115	offset = gk20a_gr_gpc_offset(g, gpc) +	2122	offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2122	"ESR %s(0x%x)",	2129	"ESR %s(0x%x)",
2123	"MMU NACK ERROR",	2130	"MMU NACK ERROR",
2124	warp_esr_error);	2131	warp_esr_error);
2125	return 0;	2132	return err;
2126	}	2133	}
2127		2134
2128	static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)	2135	static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)