From ef69df6dae3dd21f10b035e687381a578344e417 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Thu, 14 Feb 2019 13:36:19 -0800
Subject: gpu: nvgpu: add hal to mask/unmask intr during teardown

ctxsw timeout error prevents recovery as it can get triggered
periodically. Disable ctxsw timeout interrupt to allow recovery.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47470e13968d8b26cdaf519b62fd510bc7ea05d9
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/2019645
Signed-off-by: Debarshi Dutta
(cherry picked from commit 68c13e2f0447118d7391807c9b9269749d09a4ec
in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/2024899
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 39 +++++++++++++++++++--------------
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  2 ++
 drivers/gpu/nvgpu/gm20b/hal_gm20b.c     |  2 ++
 drivers/gpu/nvgpu/gp10b/hal_gp10b.c     |  2 ++
 drivers/gpu/nvgpu/gv100/fifo_gv100.c    | 20 ++++++++++++++++-
 drivers/gpu/nvgpu/gv100/fifo_gv100.h    |  4 +++-
 drivers/gpu/nvgpu/gv100/hal_gv100.c     |  2 ++
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    | 31 ++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.h    |  2 ++
 drivers/gpu/nvgpu/gv11b/hal_gv11b.c     |  2 ++
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h |  2 ++
 11 files changed, 90 insertions(+), 18 deletions(-)

(limited to 'drivers/gpu/nvgpu')

diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index a1844a28..d6f1cb3a 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1980,6 +1980,27 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 }
 
+void gk20a_fifo_teardown_mask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val &= ~(fifo_intr_en_0_sched_error_m() |
+		fifo_intr_en_0_mmu_fault_m());
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+	gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
+}
+
+void gk20a_fifo_teardown_unmask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val |= fifo_intr_en_0_mmu_fault_f(1) | fifo_intr_en_0_sched_error_f(1);
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+
+}
+
 void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 		u32 hw_id, unsigned int id_type, unsigned int rc_type,
 		struct mmu_fault_info *mmfault)
@@ -1987,7 +2008,6 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	unsigned long engine_id, i;
 	unsigned long _engine_ids = __engine_ids;
 	unsigned long engine_ids = 0;
-	u32 val;
 	u32 mmu_fault_engines = 0;
 	u32 ref_type;
 	u32 ref_id;
@@ -2048,25 +2068,12 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	}
 
 	if (mmu_fault_engines) {
-		/*
-		 * sched error prevents recovery, and ctxsw error will retrigger
-		 * every 100ms. Disable the sched error to allow recovery.
-		 */
-		val = gk20a_readl(g, fifo_intr_en_0_r());
-		val &= ~(fifo_intr_en_0_sched_error_m() |
-			fifo_intr_en_0_mmu_fault_m());
-		gk20a_writel(g, fifo_intr_en_0_r(), val);
-		gk20a_writel(g, fifo_intr_0_r(),
-				fifo_intr_0_sched_error_reset_f());
-
+		g->ops.fifo.teardown_mask_intr(g);
 		g->ops.fifo.trigger_mmu_fault(g, engine_ids);
 		gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines, ref_id,
 				ref_id_is_tsg);
 
-		val = gk20a_readl(g, fifo_intr_en_0_r());
-		val |= fifo_intr_en_0_mmu_fault_f(1)
-			| fifo_intr_en_0_sched_error_f(1);
-		gk20a_writel(g, fifo_intr_en_0_r(), val);
+		g->ops.fifo.teardown_unmask_intr(g);
 	}
 
 	nvgpu_log_info(g, "release runlist_lock for all runlists");
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 29c2f889..0c9d9101 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -426,6 +426,8 @@ bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
 		bool *verbose, u32 *ms);
 bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
 		bool *verbose, u32 *ms);
+void gk20a_fifo_teardown_mask_intr(struct gk20a *g);
+void gk20a_fifo_teardown_unmask_intr(struct gk20a *g);
 bool gk20a_fifo_handle_sched_error(struct gk20a *g);
 
 void gk20a_fifo_reset_pbdma_method(struct gk20a *g, int pbdma_id,
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index dbfbc3d7..c5bf4ff5 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -448,6 +448,8 @@ static const struct gpu_ops gm20b_ops = {
 		.init_pbdma_intr_descs = gm20b_fifo_init_pbdma_intr_descs,
 		.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
 		.teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg,
+		.teardown_mask_intr = gk20a_fifo_teardown_mask_intr,
+		.teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr,
 		.handle_sched_error = gk20a_fifo_handle_sched_error,
 		.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index b3379253..5fcfb32f 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -488,6 +488,8 @@ static const struct gpu_ops gp10b_ops = {
 		.init_pbdma_intr_descs = gp10b_fifo_init_pbdma_intr_descs,
 		.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
 		.teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg,
+		.teardown_mask_intr = gk20a_fifo_teardown_mask_intr,
+		.teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr,
 		.handle_sched_error = gk20a_fifo_handle_sched_error,
 		.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,
diff --git a/drivers/gpu/nvgpu/gv100/fifo_gv100.c b/drivers/gpu/nvgpu/gv100/fifo_gv100.c
index 0b5515f2..e23bc77b 100644
--- a/drivers/gpu/nvgpu/gv100/fifo_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/fifo_gv100.c
@@ -1,7 +1,7 @@
 /*
  * GV100 fifo
  *
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -55,3 +55,21 @@ void gv100_apply_ctxsw_timeout_intr(struct gk20a *g)
 	gk20a_writel(g, fifo_eng_timeout_r(), timeout);
 }
 
+void gv100_fifo_teardown_mask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val &= ~(fifo_intr_en_0_sched_error_m());
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+	gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
+}
+
+void gv100_fifo_teardown_unmask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val |= fifo_intr_en_0_sched_error_f(1);
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+}
diff --git a/drivers/gpu/nvgpu/gv100/fifo_gv100.h b/drivers/gpu/nvgpu/gv100/fifo_gv100.h
index 0af3fcce..e9a89766 100644
--- a/drivers/gpu/nvgpu/gv100/fifo_gv100.h
+++ b/drivers/gpu/nvgpu/gv100/fifo_gv100.h
@@ -1,7 +1,7 @@
 /*
  * GV100 Fifo
  *
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -31,4 +31,6 @@ struct gk20a;
 u32 gv100_fifo_get_num_fifos(struct gk20a *g);
 u32 gv100_fifo_get_preempt_timeout(struct gk20a *g);
 void gv100_apply_ctxsw_timeout_intr(struct gk20a *g);
+void gv100_fifo_teardown_mask_intr(struct gk20a *g);
+void gv100_fifo_teardown_unmask_intr(struct gk20a *g);
 #endif
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index 9621aaa4..c8849631 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -639,6 +639,8 @@ static const struct gpu_ops gv100_ops = {
 		.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs,
 		.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
 		.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg,
+		.teardown_mask_intr = gv100_fifo_teardown_mask_intr,
+		.teardown_unmask_intr = gv100_fifo_teardown_unmask_intr,
 		.handle_sched_error = gk20a_fifo_handle_sched_error,
 		.handle_pbdma_intr_0 = gv11b_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gv11b_fifo_handle_pbdma_intr_1,
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index b5af2972..5b84df47 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -982,6 +982,33 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
 	}
 }
 
+void gv11b_fifo_teardown_mask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	/*
+	 * ctxsw timeout error prevents recovery, and ctxsw error will retrigger
+	 * every 100ms. Disable ctxsw timeout error to allow recovery.
+	 */
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val &= ~ fifo_intr_0_ctxsw_timeout_pending_f();
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+	gk20a_writel(g, fifo_intr_ctxsw_timeout_r(),
+			gk20a_readl(g, fifo_intr_ctxsw_timeout_r()));
+
+}
+
+void gv11b_fifo_teardown_unmask_intr(struct gk20a *g)
+{
+	u32 val;
+
+	/* enable ctxsw timeout interrupt */
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val |= fifo_intr_0_ctxsw_timeout_pending_f();
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+}
+
+
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			u32 id, unsigned int id_type, unsigned int rc_type,
 			 struct mmu_fault_info *mmfault)
@@ -1001,6 +1028,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 				runlist_lock);
 	}
 
+	g->ops.fifo.teardown_mask_intr(g);
+
 	/* get runlist id and tsg */
 	if (id_type == ID_TYPE_TSG) {
 		if (id != FIFO_INVAL_TSG_ID) {
@@ -1195,6 +1224,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		nvgpu_pmu_enable_elpg(g);
 	}
 
+	g->ops.fifo.teardown_unmask_intr(g);
+
 	/* release runlist_lock */
 	if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
 		nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index 3d491bad..bc6b0f1c 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -87,6 +87,8 @@ int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		u32 id, unsigned int id_type, unsigned int rc_type,
 		struct mmu_fault_info *mmfault);
+void gv11b_fifo_teardown_mask_intr(struct gk20a *g);
+void gv11b_fifo_teardown_unmask_intr(struct gk20a *g);
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f);
 int gv11b_init_fifo_reset_enable_hw(struct gk20a *g);
 bool gv11b_fifo_handle_sched_error(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index d52d1c7e..718b6f93 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -601,6 +601,8 @@ static const struct gpu_ops gv11b_ops = {
 		.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs,
 		.reset_enable_hw = gv11b_init_fifo_reset_enable_hw,
 		.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg,
+		.teardown_mask_intr = gv11b_fifo_teardown_mask_intr,
+		.teardown_unmask_intr = gv11b_fifo_teardown_unmask_intr,
 		.handle_sched_error = gv11b_fifo_handle_sched_error,
 		.handle_pbdma_intr_0 = gv11b_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gv11b_fifo_handle_pbdma_intr_1,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index aa435638..81a4e7b8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -718,6 +718,8 @@ struct gpu_ops {
 		void (*teardown_ch_tsg)(struct gk20a *g, u32 act_eng_bitmask,
 				u32 id, unsigned int id_type, unsigned int rc_type,
 				struct mmu_fault_info *mmfault);
+		void (*teardown_mask_intr)(struct gk20a *g);
+		void (*teardown_unmask_intr)(struct gk20a *g);
 		bool (*handle_sched_error)(struct gk20a *g);
 		bool (*handle_ctxsw_timeout)(struct gk20a *g, u32 fifo_intr);
 		unsigned int (*handle_pbdma_intr_0)(struct gk20a *g,
--
cgit v1.2.2
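
Note on the pattern: the patch brackets recovery with two chip-specific HAL hooks, teardown_mask_intr() to mask the interrupt that would otherwise keep re-firing (sched error on gk20a/gv100, ctxsw timeout on gv11b) and teardown_unmask_intr() to restore it once recovery completes. Below is a minimal, self-contained C sketch of that mask, recover, unmask sequence. It is an illustration only, not nvgpu code: the register map, accessors and helpers (fake_regs, reg_read, reg_write, INTR_EN_0, CTXSW_TIMEOUT_BIT, do_recovery) are hypothetical stand-ins for the real gk20a_readl()/gk20a_writel() and fifo_intr_en_0_r() interfaces.

/*
 * Illustrative sketch only, not part of the patch above. All names here
 * (fake_regs, reg_read, reg_write, INTR_EN_0, CTXSW_TIMEOUT_BIT,
 * do_recovery) are hypothetical stand-ins for the nvgpu register API.
 */
#include <stdint.h>
#include <stdio.h>

#define INTR_EN_0         0u          /* index of a fake interrupt-enable register */
#define CTXSW_TIMEOUT_BIT (1u << 1)   /* fake "ctxsw timeout" enable bit */

static uint32_t fake_regs[4];         /* stand-in for MMIO register space */

static uint32_t reg_read(uint32_t r)              { return fake_regs[r]; }
static void     reg_write(uint32_t r, uint32_t v) { fake_regs[r] = v; }

/* Mask the ctxsw timeout interrupt so it cannot re-fire during recovery. */
static void teardown_mask_intr(void)
{
	uint32_t val = reg_read(INTR_EN_0);

	val &= ~CTXSW_TIMEOUT_BIT;
	reg_write(INTR_EN_0, val);
}

/* Re-enable the ctxsw timeout interrupt once recovery is complete. */
static void teardown_unmask_intr(void)
{
	uint32_t val = reg_read(INTR_EN_0);

	val |= CTXSW_TIMEOUT_BIT;
	reg_write(INTR_EN_0, val);
}

/* Placeholder for the engine reset / runlist teardown work. */
static void do_recovery(void)
{
	printf("recovering with ctxsw timeout intr %s\n",
	       (reg_read(INTR_EN_0) & CTXSW_TIMEOUT_BIT) ? "enabled" : "masked");
}

int main(void)
{
	reg_write(INTR_EN_0, CTXSW_TIMEOUT_BIT);  /* interrupt starts enabled */

	teardown_mask_intr();    /* as in the patch: mask before recovery */
	do_recovery();
	teardown_unmask_intr();  /* and unmask once recovery is done */

	printf("intr_en_0 = 0x%x\n", reg_read(INTR_EN_0));
	return 0;
}

Routing this through gpu_ops.fifo lets gv11b mask only the ctxsw timeout interrupt while gk20a/gv100 keep masking sched error and mmu fault, without changing the common teardown path.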