From bfe65407bde2b5d0776724301e215c6553c989f3 Mon Sep 17 00:00:00 2001
From: Vinod G <vinodg@nvidia.com>
Date: Tue, 7 Aug 2018 23:09:30 -0700
Subject: gpu: nvgpu: Read sm error ioctl support for tsg

Add READ_SM_ERROR IOCTL support at the TSG level.
Move the struct that saves the sm_error details
from gr to tsg, since sm_error support is context
based, not global.

Also correct a MISRA Rule 21.1 violation in tsg_gk20a.h
(the include guard used a reserved identifier).

nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and
nvgpu_dbg_gpu_ioctl_read_single_sm_error_state
functions are modified to use the tsg struct
nvgpu_tsg_sm_error_state.

Bug 200412642

Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1794856
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/gk20a.h     |  2 +-
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c  | 30 --------------
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h  |  9 ----
 drivers/gpu/nvgpu/gk20a/tsg_gk20a.c | 82 ++++++++++++++++++++++++++++++++++---
 drivers/gpu/nvgpu/gk20a/tsg_gk20a.h | 21 ++++++++--
 5 files changed, 95 insertions(+), 49 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a')

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index cf202f14..192f4c3e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -396,7 +396,7 @@ struct gpu_ops {
 				u32 sm, struct channel_gk20a *fault_ch);
 		int (*update_sm_error_state)(struct gk20a *g,
 				struct channel_gk20a *ch, u32 sm_id,
-				struct nvgpu_gr_sm_error_state *sm_error_state);
+				struct nvgpu_tsg_sm_error_state *sm_error_state);
 		int (*clear_sm_error_state)(struct gk20a *g,
 				struct channel_gk20a *ch, u32 sm_id);
 		int (*suspend_contexts)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index f2b083d7..cdc00bbd 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1561,19 +1561,6 @@ restore_fe_go_idle:
 	if (err)
 		goto clean_up;
 
-	nvgpu_kfree(g, gr->sm_error_states);
-
-	/* we need to allocate this after g->ops.gr.init_fs_state() since
-	 * we initialize gr->no_of_sm in this function
-	 */
-	gr->sm_error_states = nvgpu_kzalloc(g,
-			sizeof(struct nvgpu_gr_sm_error_state)
-			* gr->no_of_sm);
-	if (!gr->sm_error_states) {
-		err = -ENOMEM;
-		goto restore_fe_go_idle;
-	}
-
 	ctx_header_words =  roundup(ctx_header_bytes, sizeof(u32));
 	ctx_header_words >>= 2;
 
@@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 
 	memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
 
-	nvgpu_kfree(g, gr->sm_error_states);
 	nvgpu_kfree(g, gr->gpc_tpc_count);
 	nvgpu_kfree(g, gr->gpc_zcb_count);
 	nvgpu_kfree(g, gr->gpc_ppc_count);
@@ -4545,22 +4531,6 @@ restore_fe_go_idle:
 
 	err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
 				 GR_IDLE_CHECK_DEFAULT);
-	if (err)
-		goto out;
-
-	nvgpu_kfree(g, gr->sm_error_states);
-
-	/* we need to allocate this after g->ops.gr.init_fs_state() since
-	 * we initialize gr->no_of_sm in this function
-	 */
-	gr->sm_error_states = nvgpu_kzalloc(g,
-			sizeof(struct nvgpu_gr_sm_error_state) *
-			gr->no_of_sm);
-	if (!gr->sm_error_states) {
-		err = -ENOMEM;
-		goto restore_fe_go_idle;
-	}
-
 out:
 	nvgpu_log_fn(g, "done");
 	return err;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 3fc7e55f..bd5e625d 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec {
 	u32 default_compute_preempt_mode; /* default mode */
 };
 
-struct nvgpu_gr_sm_error_state {
-	u32 hww_global_esr;
-	u32 hww_warp_esr;
-	u64 hww_warp_esr_pc;
-	u32 hww_global_esr_report_mask;
-	u32 hww_warp_esr_report_mask;
-};
-
 struct gr_gk20a {
 	struct gk20a *g;
 	struct {
@@ -427,7 +419,6 @@ struct gr_gk20a {
 	u32 *fbp_rop_l2_en_mask;
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
-	struct nvgpu_gr_sm_error_state *sm_error_states;
 
 #define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE		(0x0U)
 #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL		(0x1U << 0)
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 62763da3..624ee1d7 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	int err;
 
 	tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo);
-	if (!tsg)
+	if (tsg == NULL) {
 		return NULL;
+	}
+
+	/* This must be allocated after g->ops.gr.init_fs_state() has run,
+	 * since that function initializes gr->no_of_sm.
+	 */
+	if (g->gr.no_of_sm == 0U) {
+		nvgpu_err(g, "no_of_sm %d not set, failed allocation",
+				  g->gr.no_of_sm);
+		return NULL;
+	}
+
+	err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm);
+	if (err != 0) {
+		return NULL;
+	}
 
 	tsg->g = g;
 	tsg->num_active_channels = 0;
@@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 
 	if (g->ops.fifo.tsg_open) {
 		err = g->ops.fifo.tsg_open(tsg);
-		if (err) {
+		if (err != 0) {
 			nvgpu_err(g, "tsg %d fifo open failed %d",
 				  tsg->tsgid, err);
 			goto clean_up;
@@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	return tsg;
 
 clean_up:
+
+	if(tsg->sm_error_states != NULL) {
+		nvgpu_kfree(g, tsg->sm_error_states);
+		tsg->sm_error_states = NULL;
+	}
+
 	nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release);
 	return NULL;
 }
@@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
 	struct gk20a *g = tsg->g;
 	struct gk20a_event_id_data *event_id_data, *event_id_data_temp;
 
-	if (g->ops.fifo.tsg_release)
+	if (g->ops.fifo.tsg_release != NULL) {
 		g->ops.fifo.tsg_release(tsg);
+	}
 
-	if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem))
+	if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) {
 		gr_gk20a_free_tsg_gr_ctx(tsg);
+	}
 
-	if (g->ops.fifo.deinit_eng_method_buffers)
+	if (g->ops.fifo.deinit_eng_method_buffers != NULL) {
 		g->ops.fifo.deinit_eng_method_buffers(g, tsg);
+	}
 
-	if (tsg->vm) {
+	if (tsg->vm != NULL) {
 		nvgpu_vm_put(tsg->vm);
 		tsg->vm = NULL;
 	}
 
+	if(tsg->sm_error_states != NULL) {
+		nvgpu_kfree(g, tsg->sm_error_states);
+		tsg->sm_error_states = NULL;
+	}
+
 	/* unhook all events created on this TSG */
 	nvgpu_mutex_acquire(&tsg->event_id_list_lock);
 	nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp,
@@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch)
 
 	return tsg;
 }
+
+/*
+ * Allocate, via nvgpu_kzalloc, the per-SM error state array of a TSG
+ * with room for num_sm entries. An existing array is left untouched.
+ *
+ * Returns 0 on success or if already allocated, -ENOMEM on failure.
+ */
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+					struct tsg_gk20a *tsg,
+					u32 num_sm)
+{
+	if (tsg->sm_error_states == NULL) {
+		tsg->sm_error_states = nvgpu_kzalloc(g,
+			sizeof(struct nvgpu_tsg_sm_error_state) * num_sm);
+		if (tsg->sm_error_states == NULL) {
+			nvgpu_err(g, "sm_error_states mem allocation failed");
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Record the error state reported for one SM in the TSG's per-SM table.
+ *
+ * @tsg:            TSG whose sm_error_states array is updated.
+ * @sm_id:          index of the SM entry to overwrite; not range-checked
+ *                  here, so the caller must pass a value below the num_sm
+ *                  the array was allocated with.
+ * @sm_error_state: source values; all five hww_* fields are copied.
+ *
+ * NOTE(review): the _locked suffix suggests the caller holds a lock
+ * protecting the TSG state - confirm against the call sites.
+ */
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+				u32 sm_id,
+				struct nvgpu_tsg_sm_error_state *sm_error_state)
+{
+	/* The struct has only plain scalar members: assign it wholesale. */
+	tsg->sm_error_states[sm_id] = *sm_error_state;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index 552c3bb3..67ccb9f5 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -19,8 +19,8 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-#ifndef __TSG_GK20A_H_
-#define __TSG_GK20A_H_
+#ifndef TSG_GK20A_H
+#define TSG_GK20A_H
 
 #include <nvgpu/lock.h>
 #include <nvgpu/kref.h>
@@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref);
 int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid);
 struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch);
 
+struct nvgpu_tsg_sm_error_state { /* one tsg->sm_error_states[] entry */
+	u32 hww_global_esr; /* presumably SM global ESR value - TODO confirm */
+	u32 hww_warp_esr; /* presumably SM warp ESR value - TODO confirm */
+	u64 hww_warp_esr_pc; /* presumably PC tied to warp ESR - TODO confirm */
+	u32 hww_global_esr_report_mask;
+	u32 hww_warp_esr_report_mask;
+};
+
 struct tsg_gk20a {
 	struct gk20a *g;
 
@@ -69,6 +77,7 @@ struct tsg_gk20a {
 	bool tpc_num_initialized;
 	bool in_use;
 
+	struct nvgpu_tsg_sm_error_state *sm_error_states;
 };
 
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);
@@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
 u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg);
 int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg,
 				u32 priority);
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+					struct tsg_gk20a *tsg,
+					u32 num_sm);
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+			u32 sm_id,
+			struct nvgpu_tsg_sm_error_state *sm_error_state);
 
 struct gk20a_event_id_data {
 	struct gk20a *g;
@@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
 		((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
 };
 
-#endif /* __TSG_GK20A_H_ */
+#endif /* TSG_GK20A_H */
-- 
cgit v1.2.2