summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
diff options
context:
space:
mode:
authorSeema Khowala <seemaj@nvidia.com>2018-02-23 16:00:00 -0500
committermobile promotions <svcmobile_promotions@nvidia.com>2018-06-24 12:53:44 -0400
commitcd6e821cf66837a2c3479e928414007064b9c496 (patch)
tree18e5cfde24246342b05e8431ba8b816de9ec407b /drivers/gpu/nvgpu/gv11b/fb_gv11b.c
parent5cf1eb145fef763f7153e449be60f1a7602e2c81 (diff)
gpu: nvgpu: gv11b: add runlist abort & remove bare channel
-Add support for aborting runlist/s. Aborting runlist/s, will abort all active tsgs and associated active channels within these active tsgs -Bare channels are no longer supported. Remove recovery support for bare channels. In case there are bare channels, recovery will trigger runlist abort Bug 2125776 Bug 2108544 Bug 2105322 Bug 2092051 Bug 2048824 Bug 2043838 Bug 2039587 Bug 2028993 Bug 2029245 Bug 2065990 Bug 1945121 Bug 200401707 Bug 200393631 Bug 200327596 Change-Id: I6bec8a0004508cf65ea128bf641a26bf4c2f236d Signed-off-by: Seema Khowala <seemaj@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1640567 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/fb_gv11b.c')
-rw-r--r--drivers/gpu/nvgpu/gv11b/fb_gv11b.c79
1 file changed, 52 insertions, 27 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index 54f0d2d8..2ceb816b 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -870,10 +870,11 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
870static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g, 870static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
871 struct mmu_fault_info *mmfault, u32 *invalidate_replay_val) 871 struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
872{ 872{
873 unsigned int id_type; 873 unsigned int id_type = ID_TYPE_UNKNOWN;
874 u32 num_lce, act_eng_bitmask = 0; 874 u32 num_lce, act_eng_bitmask = 0;
875 int err = 0; 875 int err = 0;
876 u32 id = ((u32)~0); 876 u32 id = FIFO_INVAL_TSG_ID;
877 unsigned int rc_type = RC_TYPE_NO_RC;
877 878
878 if (!mmfault->valid) 879 if (!mmfault->valid)
879 return; 880 return;
@@ -888,18 +889,23 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
888 /* CE page faults are not reported as replayable */ 889 /* CE page faults are not reported as replayable */
889 nvgpu_log(g, gpu_dbg_intr, "CE Faulted"); 890 nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
890 err = gv11b_fb_fix_page_fault(g, mmfault); 891 err = gv11b_fb_fix_page_fault(g, mmfault);
891 gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch, 892 if (mmfault->refch &&
892 mmfault->faulted_pbdma, mmfault->faulted_engine); 893 (u32)mmfault->refch->tsgid != FIFO_INVAL_TSG_ID) {
894 gv11b_fifo_reset_pbdma_and_eng_faulted(g,
895 &g->fifo.tsg[mmfault->refch->tsgid],
896 mmfault->faulted_pbdma,
897 mmfault->faulted_engine);
898 }
893 if (!err) { 899 if (!err) {
894 nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed"); 900 nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
895 *invalidate_replay_val = 0; 901 *invalidate_replay_val = 0;
896 /* refch in mmfault is assigned at the time of copying 902 if (mmfault->refch) {
897 * fault info from snap reg or bar2 fault buf 903 gk20a_channel_put(mmfault->refch);
898 */ 904 mmfault->refch = NULL;
899 gk20a_channel_put(mmfault->refch); 905 }
900 return; 906 return;
901 } 907 }
902 /* Do recovery. Channel recovery needs refch */ 908 /* Do recovery */
903 nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed"); 909 nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
904 } 910 }
905 911
@@ -911,16 +917,9 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
911 * instance block, the fault cannot be isolated to a 917 * instance block, the fault cannot be isolated to a
912 * single context so we need to reset the entire runlist 918 * single context so we need to reset the entire runlist
913 */ 919 */
914 id_type = ID_TYPE_UNKNOWN; 920 rc_type = RC_TYPE_MMU_FAULT;
915 921
916 } else if (mmfault->refch) { 922 } else if (mmfault->refch) {
917 if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
918 id = mmfault->refch->tsgid;
919 id_type = ID_TYPE_TSG;
920 } else {
921 id = mmfault->chid;
922 id_type = ID_TYPE_CHANNEL;
923 }
924 if (mmfault->refch->mmu_nack_handled) { 923 if (mmfault->refch->mmu_nack_handled) {
925 /* We have already recovered for the same 924 /* We have already recovered for the same
926 * context, skip doing another recovery. 925 * context, skip doing another recovery.
@@ -941,19 +940,40 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
941 */ 940 */
942 gk20a_channel_put(mmfault->refch); 941 gk20a_channel_put(mmfault->refch);
943 return; 942 return;
943 } else {
944 /* Indicate recovery is handled if mmu fault is
945 * a result of mmu nack.
946 */
947 mmfault->refch->mmu_nack_handled = true;
948 }
949
950 rc_type = RC_TYPE_MMU_FAULT;
951 if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
952 id = mmfault->refch->tsgid;
953 if (id != FIFO_INVAL_TSG_ID)
954 id_type = ID_TYPE_TSG;
955 } else {
956 nvgpu_err(g, "bare channels not supported");
944 } 957 }
945 } else {
946 id_type = ID_TYPE_UNKNOWN;
947 } 958 }
948 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) 959
960 /* engine is faulted */
961 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) {
949 act_eng_bitmask = BIT(mmfault->faulted_engine); 962 act_eng_bitmask = BIT(mmfault->faulted_engine);
963 rc_type = RC_TYPE_MMU_FAULT;
964 }
950 965
951 /* Indicate recovery is handled if mmu fault is a result of 966 /* refch in mmfault is assigned at the time of copying
952 * mmu nack. 967 * fault info from snap reg or bar2 fault buf
953 */ 968 */
954 mmfault->refch->mmu_nack_handled = true; 969 if (mmfault->refch) {
955 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, 970 gk20a_channel_put(mmfault->refch);
956 id, id_type, RC_TYPE_MMU_FAULT, mmfault); 971 mmfault->refch = NULL;
972 }
973
974 if (rc_type != RC_TYPE_NO_RC)
975 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
976 id, id_type, rc_type, mmfault);
957 } else { 977 } else {
958 if (mmfault->fault_type == gmmu_fault_type_pte_v()) { 978 if (mmfault->fault_type == gmmu_fault_type_pte_v()) {
959 nvgpu_log(g, gpu_dbg_intr, "invalid pte! try to fix"); 979 nvgpu_log(g, gpu_dbg_intr, "invalid pte! try to fix");
@@ -972,7 +992,10 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
972 /* refch in mmfault is assigned at the time of copying 992 /* refch in mmfault is assigned at the time of copying
973 * fault info from snap reg or bar2 fault buf 993 * fault info from snap reg or bar2 fault buf
974 */ 994 */
975 gk20a_channel_put(mmfault->refch); 995 if (mmfault->refch) {
996 gk20a_channel_put(mmfault->refch);
997 mmfault->refch = NULL;
998 }
976 } 999 }
977} 1000}
978 1001
@@ -1061,8 +1084,10 @@ void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
1061 next_fault_addr = mmfault->fault_addr; 1084 next_fault_addr = mmfault->fault_addr;
1062 if (prev_fault_addr == next_fault_addr) { 1085 if (prev_fault_addr == next_fault_addr) {
1063 nvgpu_log(g, gpu_dbg_intr, "pte already scanned"); 1086 nvgpu_log(g, gpu_dbg_intr, "pte already scanned");
1064 if (mmfault->refch) 1087 if (mmfault->refch) {
1065 gk20a_channel_put(mmfault->refch); 1088 gk20a_channel_put(mmfault->refch);
1089 mmfault->refch = NULL;
1090 }
1066 continue; 1091 continue;
1067 } 1092 }
1068 } 1093 }