gpu: nvgpu: Linux specific sm_error_state_record

Create an nvgpu-internal struct nvgpu_gr_sm_error_state to store and propagate SM error state within the driver. Use nvgpu_dbg_gpu_sm_error_state_record only in Linux-specific code.

JIRA NVGPU-259

Change-Id: I7365cdf5a1a42cbcdb418dfcef3e0020e02a960f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1585645
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Terje Bergstrom <tbergstrom@nvidia.com> 2017-10-25 17:17:30 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-10-26 16:26:25 -0400
commit: 9eebb7831facaa16b2975f50a716d2986c67b699 (patch)
tree: 8c0f5ba76e76c10762a04ea7fd7b681960f8ed5b /drivers/gpu
parent: 34ce21a588ad3e6d11a8fa6bc5b9e7282dca8f61 (diff)
7 files changed, 48 insertions, 27 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
index 7e62bb5c..403d9261 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
@@ -239,7 +239,8 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
        struct gr_gk20a *gr = &g->gr;
        u32 sm_id;
        struct channel_gk20a *ch;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
+        struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
+        struct nvgpu_gr_sm_error_state sm_error_state;
        int err = 0;
        ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
@@ -250,41 +251,43 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
        if (sm_id >= gr->no_of_sm)
                return -EINVAL;
-        sm_error_state = nvgpu_kzalloc(g, sizeof(*sm_error_state));
-        if (!sm_error_state)
-                return -ENOMEM;
        if (args->sm_error_state_record_size > 0) {
-                size_t read_size = sizeof(*sm_error_state);
+                size_t read_size = sizeof(sm_error_state_record);
                if (read_size > args->sm_error_state_record_size)
                        read_size = args->sm_error_state_record_size;
                nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-                err = copy_from_user(sm_error_state,
+                err = copy_from_user(&sm_error_state_record,
                          (void __user *)(uintptr_t)
                                args->sm_error_state_record_mem,
                          read_size);
                nvgpu_mutex_release(&g->dbg_sessions_lock);
-                if (err) {
+                if (err)
-                        err = -ENOMEM;
+                        return -ENOMEM;
-                        goto err_free;
-                }
        }
        err = gk20a_busy(g);
        if (err)
-                goto err_free;
+                return err;
+        sm_error_state.hww_global_esr =
+                sm_error_state_record.hww_global_esr;
+        sm_error_state.hww_warp_esr =
+                sm_error_state_record.hww_warp_esr;
+        sm_error_state.hww_warp_esr_pc =
+                sm_error_state_record.hww_warp_esr_pc;
+        sm_error_state.hww_global_esr_report_mask =
+                sm_error_state_record.hww_global_esr_report_mask;
+        sm_error_state.hww_warp_esr_report_mask =
+                sm_error_state_record.hww_warp_esr_report_mask;
        err = gr_gk20a_elpg_protected_call(g,
                        g->ops.gr.update_sm_error_state(g, ch,
-                                        sm_id, sm_error_state));
+                                        sm_id, &sm_error_state));
        gk20a_idle(g);
-err_free:
-        nvgpu_kfree(g, sm_error_state);
        return err;
 }
@@ -295,7 +298,8 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 {
        struct gk20a *g = dbg_s->g;
        struct gr_gk20a *gr = &g->gr;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
+        struct nvgpu_gr_sm_error_state *sm_error_state;
+        struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
        u32 sm_id;
        int err = 0;
@@ -304,6 +308,16 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
                return -EINVAL;
        sm_error_state = gr->sm_error_states + sm_id;
+        sm_error_state_record.hww_global_esr =
+                sm_error_state->hww_global_esr;
+        sm_error_state_record.hww_warp_esr =
+                sm_error_state->hww_warp_esr;
+        sm_error_state_record.hww_warp_esr_pc =
+                sm_error_state->hww_warp_esr_pc;
+        sm_error_state_record.hww_global_esr_report_mask =
+                sm_error_state->hww_global_esr_report_mask;
+        sm_error_state_record.hww_warp_esr_report_mask =
+                sm_error_state->hww_warp_esr_report_mask;
        if (args->sm_error_state_record_size > 0) {
                size_t write_size = sizeof(*sm_error_state);
@@ -314,7 +328,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
                nvgpu_mutex_acquire(&g->dbg_sessions_lock);
                err = copy_to_user((void __user *)(uintptr_t)
                                                args->sm_error_state_record_mem,
-                                   sm_error_state,
+                                   &sm_error_state_record,
                                   write_size);
                nvgpu_mutex_release(&g->dbg_sessions_lock);
                if (err) {
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 13d534c4..80d85d65 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -361,8 +361,7 @@ struct gpu_ops {
                                u32 gpc, u32 tpc);
                int (*update_sm_error_state)(struct gk20a *g,
                                struct channel_gk20a *ch, u32 sm_id,
-                                struct nvgpu_dbg_gpu_sm_error_state_record *
+                                struct nvgpu_gr_sm_error_state *sm_error_state);
-                                                                sm_error_state);
                int (*clear_sm_error_state)(struct gk20a *g,
                                struct channel_gk20a *ch, u32 sm_id);
                int (*suspend_contexts)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 5910c7d9..2fd6f72c 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1543,7 +1543,7 @@ restore_fe_go_idle:
         * we initialize gr->no_of_sm in this function
         */
        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record)
+                        sizeof(struct nvgpu_gr_sm_error_state)
                        * gr->no_of_sm);
        if (!gr->sm_error_states) {
                err = -ENOMEM;
@@ -4566,7 +4566,7 @@ restore_fe_go_idle:
         * we initialize gr->no_of_sm in this function
         */
        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) *
+                        sizeof(struct nvgpu_gr_sm_error_state) *
                        gr->no_of_sm);
        if (!gr->sm_error_states) {
                err = -ENOMEM;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 6b422138..22fc40d1 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -231,6 +231,14 @@ struct nvgpu_preemption_modes_rec {
        u32 default_compute_preempt_mode; /* default mode */
 };
+struct nvgpu_gr_sm_error_state {
+        u32 hww_global_esr;
+        u32 hww_warp_esr;
+        u64 hww_warp_esr_pc;
+        u32 hww_global_esr_report_mask;
+        u32 hww_warp_esr_report_mask;
+};
 struct gr_gk20a {
        struct gk20a *g;
        struct {
@@ -387,7 +395,7 @@ struct gr_gk20a {
        u32 *fbp_rop_l2_en_mask;
        u32 no_of_sm;
        struct sm_info *sm_to_cluster;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states;
+        struct nvgpu_gr_sm_error_state *sm_error_states;
 #if defined(CONFIG_GK20A_CYCLE_STATS)
        struct nvgpu_mutex                      cs_lock;
        struct gk20a_cs_snapshot        *cs_data;
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index a1078b10..c10517b7 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1297,7 +1297,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
+                struct nvgpu_gr_sm_error_state *sm_error_state)
 {
        u32 gpc, tpc, offset;
        struct gr_gk20a *gr = &g->gr;
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index 67f1ea29..15deaa0d 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -119,7 +119,7 @@ void gr_gm20b_get_access_map(struct gk20a *g,
 int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc);
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state);
+                struct nvgpu_gr_sm_error_state *sm_error_state);
 int gm20b_gr_clear_sm_error_state(struct gk20a *g,
                struct channel_gk20a *ch, u32 sm_id);
 int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 2d6beda6..d400f08e 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -899,7 +899,7 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
        nvgpu_mutex_init(&gr->ctx_mutex);
        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) *
+                        sizeof(struct nvgpu_gr_sm_error_state) *
                        gr->no_of_sm);
        if (!gr->sm_error_states) {
                err = -ENOMEM;
@@ -1195,7 +1195,7 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
 void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
                        struct tegra_vgpu_sm_esr_info *info)
 {
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states;
+        struct nvgpu_gr_sm_error_state *sm_error_states;
        if (info->sm_id >= g->gr.no_of_sm) {
                nvgpu_err(g, "invalid smd_id %d / %d",
author	Terje Bergstrom <tbergstrom@nvidia.com>	2017-10-25 17:17:30 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-10-26 16:26:25 -0400
commit	9eebb7831facaa16b2975f50a716d2986c67b699 (patch)
tree	8c0f5ba76e76c10762a04ea7fd7b681960f8ed5b /drivers/gpu
parent	34ce21a588ad3e6d11a8fa6bc5b9e7282dca8f61 (diff)