From a09b9cd587d27a3ef6479a17631c3497d447e7a9 Mon Sep 17 00:00:00 2001
From: Vinod G <vinodg@nvidia.com>
Date: Thu, 24 May 2018 14:00:19 -0700
Subject: gpu: nvgpu: Add IOCTL for SM_EXCEPTION_TYPE_MASK

Add a new ioctl to the dbg session to set the
SM_EXCEPTION_TYPE_MASK.
Currently only the SM_EXCEPTION_TYPE_MASK_FATAL type is supported.
If this type is set, the code will skip RC recovery and
instead trigger CILP preemption.

bug  200412641
JIRA NVGPU-702

Change-Id: I4b1f18379ee792cd324ccc555939e0f4f5c9e3b4
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1729792
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h |  6 +++
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h      |  6 +++
 drivers/gpu/nvgpu/gv11b/gr_gv11b.c      | 22 +++++++++--
 drivers/gpu/nvgpu/os/linux/ioctl_dbg.c  | 68 +++++++++++++++++++++++++++++++++
 include/uapi/linux/nvgpu.h              | 21 +++++++++-
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
index 50002557..4d3c4d74 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
@@ -72,6 +72,12 @@ struct dbg_session_gk20a {
 	bool broadcast_stop_trigger;
 
 	struct nvgpu_mutex ioctl_lock;
+
+	/*
+	 * Set while this session has requested an SM exception type
+	 * mask and is counted in gr->sm_exception_mask_refcount.
+	 */
+	bool is_sm_exception_type_mask_set;
 };
 
 struct dbg_session_data {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 0c6be57b..804e0e25 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -437,6 +437,12 @@ struct gr_gk20a {
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
 	struct nvgpu_gr_sm_error_state *sm_error_states;
+
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE		(0x0U)
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL		(0x1U << 0)
+	u32 sm_exception_mask_type;
+	u32 sm_exception_mask_refcount;
+
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	struct nvgpu_mutex			cs_lock;
 	struct gk20a_cs_snapshot	*cs_data;
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index c925e5b6..9e36071f 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2182,9 +2182,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 
 	struct warp_esr_error_table_s warp_esr_error_table[] = {
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(),
-			"STACK ERROR"},
+				"STACK ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(),
-			"API STACK ERROR"},
+				"API STACK ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(),
 				"PC WRAP ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(),
@@ -2221,7 +2221,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 		if (warp_esr_error_table[index].error_value == warp_esr_error) {
 			esr_err = warp_esr_error_table[index].error_value;
 			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-				"ESR %s(0x%x)",
+				"WARP_ESR %s(0x%x)",
 				warp_esr_error_table[index].error_name,
 				esr_err);
 			break;
@@ -2250,6 +2250,21 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		return 0;
 	}
 
+	/*
+	 * Check whether the SM exception type mask has the FATAL bit set.
+	 * If set, skip the recovery and trigger CILP instead.
+	 * If not set, trigger the recovery.
+	 */
+	if ((g->gr.sm_exception_mask_type &
+					NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
+					NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
+		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
+			"SM Exception Type Mask set %d,"
+			"skip recovery",
+			g->gr.sm_exception_mask_type);
+		return 0;
+	}
+
 	if (fault_ch) {
 		tsg = &g->fifo.tsg[fault_ch->tsgid];
 
@@ -2294,7 +2309,6 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
 	u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr);
 	struct tsg_gk20a *tsg;
 
-
 	*early_exit = false;
 	*ignore_debugger = false;
 
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index eadf1f93..ad4dfc0e 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -151,6 +151,10 @@ static int dbg_unbind_all_channels_gk20a(struct dbg_session_gk20a *dbg_s);
 static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 		struct file *filp, bool is_profiler);
 
+static int nvgpu_set_sm_exception_type_mask_locked(
+					struct dbg_session_gk20a *dbg_s,
+					u32 exception_mask);
+
 unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait)
 {
 	unsigned int mask = 0;
@@ -217,6 +221,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			nvgpu_kfree(g, prof_obj);
 		}
 	}
+
+	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+					NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
+
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	nvgpu_mutex_destroy(&dbg_s->ch_list_lock);
@@ -466,6 +474,7 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->is_profiler = is_profiler;
 	dbg_s->is_pg_disabled = false;
 	dbg_s->is_timeout_disabled = false;
+	dbg_s->is_sm_exception_type_mask_set = false;
 
 	nvgpu_cond_init(&dbg_s->dbg_events.wait_queue);
 	nvgpu_init_list_node(&dbg_s->ch_list);
@@ -478,6 +487,9 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->dbg_events.events_enabled = false;
 	dbg_s->dbg_events.num_pending_events = 0;
 
+	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+					NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
+
 	return 0;
 
 err_destroy_lock:
@@ -1839,6 +1851,57 @@ out:
 	return err;
 }
 
+static int nvgpu_set_sm_exception_type_mask_locked(
+					struct dbg_session_gk20a *dbg_s,
+					u32 exception_mask)
+{
+	struct gk20a *g = dbg_s->g;
+	struct gr_gk20a *gr = &g->gr;
+	int err = 0;
+
+	switch (exception_mask) {
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
+		gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
+		if (dbg_s->is_sm_exception_type_mask_set == false) {
+			gr->sm_exception_mask_refcount++;
+			dbg_s->is_sm_exception_type_mask_set = true;
+		}
+		break;
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
+		if (dbg_s->is_sm_exception_type_mask_set) {
+			gr->sm_exception_mask_refcount--;
+			dbg_s->is_sm_exception_type_mask_set = false;
+		}
+		if (gr->sm_exception_mask_refcount == 0)
+			gr->sm_exception_mask_type =
+					NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+		break;
+	default:
+		nvgpu_err(g,
+			   "unrecognized dbg sm exception type mask: 0x%x",
+			   exception_mask);
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int nvgpu_dbg_gpu_set_sm_exception_type_mask(
+		struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *args)
+{
+	int err = 0;
+	struct gk20a *g = dbg_s->g;
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	err = nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+					args->exception_type_mask);
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+
+	return err;
+}
+
 int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
 {
 	struct nvgpu_os_linux *l = container_of(inode->i_cdev,
@@ -1994,6 +2057,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 			   (struct nvgpu_dbg_gpu_profiler_reserve_args *)buf);
 		break;
 
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK:
+		err = nvgpu_dbg_gpu_set_sm_exception_type_mask(dbg_s,
+		   (struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *)buf);
+		break;
+
 	default:
 		nvgpu_err(g,
 			   "unrecognized dbg gpu ioctl cmd: 0x%x",
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 446f5bd3..0733a7b2 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -1411,8 +1411,27 @@ struct nvgpu_dbg_gpu_profiler_reserve_args {
 #define NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE			\
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 22, struct nvgpu_dbg_gpu_profiler_reserve_args)
 
+/*
+ * This struct helps to set the exception mask. If mask is not set
+ * or set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE
+ * then kernel code will follow recovery path on sm exception.
+ * If mask is set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL, then
+ * kernel code will skip recovery path on sm exception.
+ */
+struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args {
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0U)
+	/* exception type mask value */
+	__u32 exception_type_mask;
+	__u32 reserved;
+};
+
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK \
+	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 23, \
+			struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args)
+
 #define NVGPU_DBG_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE)
+	_IOC_NR(NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK)
 
 #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE		\
 	sizeof(struct nvgpu_dbg_gpu_access_fb_memory_args)
-- 
cgit v1.2.2