diff options
author | Vinod G <vinodg@nvidia.com> | 2018-08-08 02:09:30 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2018-08-25 05:10:43 -0400 |
commit | bfe65407bde2b5d0776724301e215c6553c989f3 (patch) | |
tree | f68a01361052afe1c30a0c6dcd5d359b762e647a | |
parent | 3bd47da0954d3486d9ccd3c396f84445918f82b4 (diff) |
gpu: nvgpu: Read sm error ioctl support for tsg
Add READ_SM_ERROR IOCTL support at the TSG level.
Moved the struct that saves the sm_error details
from gr to tsg, as the sm_error support is
context-based, not global.
Also corrected a MISRA 21.1 violation in the header file.
The nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and
nvgpu_dbg_gpu_ioctl_read_single_sm_error_state
functions are modified to use the tsg struct
nvgpu_tsg_sm_error_state.
Bug 200412642
Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1794856
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.h | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 30 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/tsg_gk20a.c | 82 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/tsg_gk20a.h | 21 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 106 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/gr_gm20b.h | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 109 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | 55 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/ioctl_dbg.c | 50 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/ioctl_tsg.c | 58 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/vgpu/gr_vgpu.c | 36 | ||||
-rw-r--r-- | include/uapi/linux/nvgpu.h | 74 |
15 files changed, 386 insertions, 257 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index cf202f14..192f4c3e 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h | |||
@@ -396,7 +396,7 @@ struct gpu_ops { | |||
396 | u32 sm, struct channel_gk20a *fault_ch); | 396 | u32 sm, struct channel_gk20a *fault_ch); |
397 | int (*update_sm_error_state)(struct gk20a *g, | 397 | int (*update_sm_error_state)(struct gk20a *g, |
398 | struct channel_gk20a *ch, u32 sm_id, | 398 | struct channel_gk20a *ch, u32 sm_id, |
399 | struct nvgpu_gr_sm_error_state *sm_error_state); | 399 | struct nvgpu_tsg_sm_error_state *sm_error_state); |
400 | int (*clear_sm_error_state)(struct gk20a *g, | 400 | int (*clear_sm_error_state)(struct gk20a *g, |
401 | struct channel_gk20a *ch, u32 sm_id); | 401 | struct channel_gk20a *ch, u32 sm_id); |
402 | int (*suspend_contexts)(struct gk20a *g, | 402 | int (*suspend_contexts)(struct gk20a *g, |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index f2b083d7..cdc00bbd 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -1561,19 +1561,6 @@ restore_fe_go_idle: | |||
1561 | if (err) | 1561 | if (err) |
1562 | goto clean_up; | 1562 | goto clean_up; |
1563 | 1563 | ||
1564 | nvgpu_kfree(g, gr->sm_error_states); | ||
1565 | |||
1566 | /* we need to allocate this after g->ops.gr.init_fs_state() since | ||
1567 | * we initialize gr->no_of_sm in this function | ||
1568 | */ | ||
1569 | gr->sm_error_states = nvgpu_kzalloc(g, | ||
1570 | sizeof(struct nvgpu_gr_sm_error_state) | ||
1571 | * gr->no_of_sm); | ||
1572 | if (!gr->sm_error_states) { | ||
1573 | err = -ENOMEM; | ||
1574 | goto restore_fe_go_idle; | ||
1575 | } | ||
1576 | |||
1577 | ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); | 1564 | ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); |
1578 | ctx_header_words >>= 2; | 1565 | ctx_header_words >>= 2; |
1579 | 1566 | ||
@@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) | |||
3072 | 3059 | ||
3073 | memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc)); | 3060 | memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc)); |
3074 | 3061 | ||
3075 | nvgpu_kfree(g, gr->sm_error_states); | ||
3076 | nvgpu_kfree(g, gr->gpc_tpc_count); | 3062 | nvgpu_kfree(g, gr->gpc_tpc_count); |
3077 | nvgpu_kfree(g, gr->gpc_zcb_count); | 3063 | nvgpu_kfree(g, gr->gpc_zcb_count); |
3078 | nvgpu_kfree(g, gr->gpc_ppc_count); | 3064 | nvgpu_kfree(g, gr->gpc_ppc_count); |
@@ -4545,22 +4531,6 @@ restore_fe_go_idle: | |||
4545 | 4531 | ||
4546 | err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g), | 4532 | err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g), |
4547 | GR_IDLE_CHECK_DEFAULT); | 4533 | GR_IDLE_CHECK_DEFAULT); |
4548 | if (err) | ||
4549 | goto out; | ||
4550 | |||
4551 | nvgpu_kfree(g, gr->sm_error_states); | ||
4552 | |||
4553 | /* we need to allocate this after g->ops.gr.init_fs_state() since | ||
4554 | * we initialize gr->no_of_sm in this function | ||
4555 | */ | ||
4556 | gr->sm_error_states = nvgpu_kzalloc(g, | ||
4557 | sizeof(struct nvgpu_gr_sm_error_state) * | ||
4558 | gr->no_of_sm); | ||
4559 | if (!gr->sm_error_states) { | ||
4560 | err = -ENOMEM; | ||
4561 | goto restore_fe_go_idle; | ||
4562 | } | ||
4563 | |||
4564 | out: | 4534 | out: |
4565 | nvgpu_log_fn(g, "done"); | 4535 | nvgpu_log_fn(g, "done"); |
4566 | return err; | 4536 | return err; |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 3fc7e55f..bd5e625d 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h | |||
@@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec { | |||
254 | u32 default_compute_preempt_mode; /* default mode */ | 254 | u32 default_compute_preempt_mode; /* default mode */ |
255 | }; | 255 | }; |
256 | 256 | ||
257 | struct nvgpu_gr_sm_error_state { | ||
258 | u32 hww_global_esr; | ||
259 | u32 hww_warp_esr; | ||
260 | u64 hww_warp_esr_pc; | ||
261 | u32 hww_global_esr_report_mask; | ||
262 | u32 hww_warp_esr_report_mask; | ||
263 | }; | ||
264 | |||
265 | struct gr_gk20a { | 257 | struct gr_gk20a { |
266 | struct gk20a *g; | 258 | struct gk20a *g; |
267 | struct { | 259 | struct { |
@@ -427,7 +419,6 @@ struct gr_gk20a { | |||
427 | u32 *fbp_rop_l2_en_mask; | 419 | u32 *fbp_rop_l2_en_mask; |
428 | u32 no_of_sm; | 420 | u32 no_of_sm; |
429 | struct sm_info *sm_to_cluster; | 421 | struct sm_info *sm_to_cluster; |
430 | struct nvgpu_gr_sm_error_state *sm_error_states; | ||
431 | 422 | ||
432 | #define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U) | 423 | #define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U) |
433 | #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0) | 424 | #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0) |
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c index 62763da3..624ee1d7 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c | |||
@@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) | |||
275 | int err; | 275 | int err; |
276 | 276 | ||
277 | tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo); | 277 | tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo); |
278 | if (!tsg) | 278 | if (tsg == NULL) { |
279 | return NULL; | 279 | return NULL; |
280 | } | ||
281 | |||
282 | /* we need to allocate this after g->ops.gr.init_fs_state() since | ||
283 | * we initialize gr->no_of_sm in this function | ||
284 | */ | ||
285 | if (g->gr.no_of_sm == 0U) { | ||
286 | nvgpu_err(g, "no_of_sm %d not set, failed allocation", | ||
287 | g->gr.no_of_sm); | ||
288 | return NULL; | ||
289 | } | ||
290 | |||
291 | err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm); | ||
292 | if (err != 0) { | ||
293 | return NULL; | ||
294 | } | ||
280 | 295 | ||
281 | tsg->g = g; | 296 | tsg->g = g; |
282 | tsg->num_active_channels = 0; | 297 | tsg->num_active_channels = 0; |
@@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) | |||
295 | 310 | ||
296 | if (g->ops.fifo.tsg_open) { | 311 | if (g->ops.fifo.tsg_open) { |
297 | err = g->ops.fifo.tsg_open(tsg); | 312 | err = g->ops.fifo.tsg_open(tsg); |
298 | if (err) { | 313 | if (err != 0) { |
299 | nvgpu_err(g, "tsg %d fifo open failed %d", | 314 | nvgpu_err(g, "tsg %d fifo open failed %d", |
300 | tsg->tsgid, err); | 315 | tsg->tsgid, err); |
301 | goto clean_up; | 316 | goto clean_up; |
@@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) | |||
307 | return tsg; | 322 | return tsg; |
308 | 323 | ||
309 | clean_up: | 324 | clean_up: |
325 | |||
326 | if(tsg->sm_error_states != NULL) { | ||
327 | nvgpu_kfree(g, tsg->sm_error_states); | ||
328 | tsg->sm_error_states = NULL; | ||
329 | } | ||
330 | |||
310 | nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release); | 331 | nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release); |
311 | return NULL; | 332 | return NULL; |
312 | } | 333 | } |
@@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref) | |||
317 | struct gk20a *g = tsg->g; | 338 | struct gk20a *g = tsg->g; |
318 | struct gk20a_event_id_data *event_id_data, *event_id_data_temp; | 339 | struct gk20a_event_id_data *event_id_data, *event_id_data_temp; |
319 | 340 | ||
320 | if (g->ops.fifo.tsg_release) | 341 | if (g->ops.fifo.tsg_release != NULL) { |
321 | g->ops.fifo.tsg_release(tsg); | 342 | g->ops.fifo.tsg_release(tsg); |
343 | } | ||
322 | 344 | ||
323 | if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) | 345 | if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) { |
324 | gr_gk20a_free_tsg_gr_ctx(tsg); | 346 | gr_gk20a_free_tsg_gr_ctx(tsg); |
347 | } | ||
325 | 348 | ||
326 | if (g->ops.fifo.deinit_eng_method_buffers) | 349 | if (g->ops.fifo.deinit_eng_method_buffers != NULL) { |
327 | g->ops.fifo.deinit_eng_method_buffers(g, tsg); | 350 | g->ops.fifo.deinit_eng_method_buffers(g, tsg); |
351 | } | ||
328 | 352 | ||
329 | if (tsg->vm) { | 353 | if (tsg->vm != NULL) { |
330 | nvgpu_vm_put(tsg->vm); | 354 | nvgpu_vm_put(tsg->vm); |
331 | tsg->vm = NULL; | 355 | tsg->vm = NULL; |
332 | } | 356 | } |
333 | 357 | ||
358 | if(tsg->sm_error_states != NULL) { | ||
359 | nvgpu_kfree(g, tsg->sm_error_states); | ||
360 | tsg->sm_error_states = NULL; | ||
361 | } | ||
362 | |||
334 | /* unhook all events created on this TSG */ | 363 | /* unhook all events created on this TSG */ |
335 | nvgpu_mutex_acquire(&tsg->event_id_list_lock); | 364 | nvgpu_mutex_acquire(&tsg->event_id_list_lock); |
336 | nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp, | 365 | nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp, |
@@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch) | |||
360 | 389 | ||
361 | return tsg; | 390 | return tsg; |
362 | } | 391 | } |
392 | |||
393 | int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g, | ||
394 | struct tsg_gk20a *tsg, | ||
395 | u32 num_sm) | ||
396 | { | ||
397 | int err = 0; | ||
398 | |||
399 | if (tsg->sm_error_states != NULL) { | ||
400 | return err; | ||
401 | } | ||
402 | |||
403 | tsg->sm_error_states = nvgpu_kzalloc(g, | ||
404 | sizeof(struct nvgpu_tsg_sm_error_state) | ||
405 | * num_sm); | ||
406 | if (tsg->sm_error_states == NULL) { | ||
407 | nvgpu_err(g, "sm_error_states mem allocation failed"); | ||
408 | err = -ENOMEM; | ||
409 | } | ||
410 | |||
411 | return err; | ||
412 | } | ||
413 | |||
414 | void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg, | ||
415 | u32 sm_id, | ||
416 | struct nvgpu_tsg_sm_error_state *sm_error_state) | ||
417 | { | ||
418 | struct nvgpu_tsg_sm_error_state *tsg_sm_error_states; | ||
419 | |||
420 | tsg_sm_error_states = tsg->sm_error_states + sm_id; | ||
421 | |||
422 | tsg_sm_error_states->hww_global_esr = | ||
423 | sm_error_state->hww_global_esr; | ||
424 | tsg_sm_error_states->hww_warp_esr = | ||
425 | sm_error_state->hww_warp_esr; | ||
426 | tsg_sm_error_states->hww_warp_esr_pc = | ||
427 | sm_error_state->hww_warp_esr_pc; | ||
428 | tsg_sm_error_states->hww_global_esr_report_mask = | ||
429 | sm_error_state->hww_global_esr_report_mask; | ||
430 | tsg_sm_error_states->hww_warp_esr_report_mask = | ||
431 | sm_error_state->hww_warp_esr_report_mask; | ||
432 | } | ||
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h index 552c3bb3..67ccb9f5 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h | |||
@@ -19,8 +19,8 @@ | |||
19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
20 | * DEALINGS IN THE SOFTWARE. | 20 | * DEALINGS IN THE SOFTWARE. |
21 | */ | 21 | */ |
22 | #ifndef __TSG_GK20A_H_ | 22 | #ifndef TSG_GK20A_H |
23 | #define __TSG_GK20A_H_ | 23 | #define TSG_GK20A_H |
24 | 24 | ||
25 | #include <nvgpu/lock.h> | 25 | #include <nvgpu/lock.h> |
26 | #include <nvgpu/kref.h> | 26 | #include <nvgpu/kref.h> |
@@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref); | |||
39 | int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid); | 39 | int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid); |
40 | struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch); | 40 | struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch); |
41 | 41 | ||
42 | struct nvgpu_tsg_sm_error_state { | ||
43 | u32 hww_global_esr; | ||
44 | u32 hww_warp_esr; | ||
45 | u64 hww_warp_esr_pc; | ||
46 | u32 hww_global_esr_report_mask; | ||
47 | u32 hww_warp_esr_report_mask; | ||
48 | }; | ||
49 | |||
42 | struct tsg_gk20a { | 50 | struct tsg_gk20a { |
43 | struct gk20a *g; | 51 | struct gk20a *g; |
44 | 52 | ||
@@ -69,6 +77,7 @@ struct tsg_gk20a { | |||
69 | bool tpc_num_initialized; | 77 | bool tpc_num_initialized; |
70 | bool in_use; | 78 | bool in_use; |
71 | 79 | ||
80 | struct nvgpu_tsg_sm_error_state *sm_error_states; | ||
72 | }; | 81 | }; |
73 | 82 | ||
74 | int gk20a_enable_tsg(struct tsg_gk20a *tsg); | 83 | int gk20a_enable_tsg(struct tsg_gk20a *tsg); |
@@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice); | |||
84 | u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg); | 93 | u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg); |
85 | int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg, | 94 | int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg, |
86 | u32 priority); | 95 | u32 priority); |
96 | int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g, | ||
97 | struct tsg_gk20a *tsg, | ||
98 | u32 num_sm); | ||
99 | void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg, | ||
100 | u32 sm_id, | ||
101 | struct nvgpu_tsg_sm_error_state *sm_error_state); | ||
87 | 102 | ||
88 | struct gk20a_event_id_data { | 103 | struct gk20a_event_id_data { |
89 | struct gk20a *g; | 104 | struct gk20a *g; |
@@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node) | |||
106 | ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node)); | 121 | ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node)); |
107 | }; | 122 | }; |
108 | 123 | ||
109 | #endif /* __TSG_GK20A_H_ */ | 124 | #endif /* TSG_GK20A_H */ |
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index 68ae91e8..fc4ab3dd 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c | |||
@@ -1268,32 +1268,68 @@ void gr_gm20b_get_access_map(struct gk20a *g, | |||
1268 | *num_entries = ARRAY_SIZE(wl_addr_gm20b); | 1268 | *num_entries = ARRAY_SIZE(wl_addr_gm20b); |
1269 | } | 1269 | } |
1270 | 1270 | ||
1271 | static void gm20b_gr_read_sm_error_state(struct gk20a *g, | ||
1272 | u32 offset, | ||
1273 | struct nvgpu_tsg_sm_error_state *sm_error_states) | ||
1274 | { | ||
1275 | sm_error_states->hww_global_esr = gk20a_readl(g, | ||
1276 | gr_gpc0_tpc0_sm_hww_global_esr_r() + offset); | ||
1277 | sm_error_states->hww_warp_esr = gk20a_readl(g, | ||
1278 | gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset); | ||
1279 | sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g, | ||
1280 | gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset)); | ||
1281 | sm_error_states->hww_global_esr_report_mask = gk20a_readl(g, | ||
1282 | gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset); | ||
1283 | sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g, | ||
1284 | gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset); | ||
1285 | |||
1286 | } | ||
1287 | |||
1288 | static void gm20b_gr_write_sm_error_state(struct gk20a *g, | ||
1289 | u32 offset, | ||
1290 | struct nvgpu_tsg_sm_error_state *sm_error_states) | ||
1291 | { | ||
1292 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset, | ||
1293 | sm_error_states->hww_global_esr); | ||
1294 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset, | ||
1295 | sm_error_states->hww_warp_esr); | ||
1296 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset, | ||
1297 | u64_lo32(sm_error_states->hww_warp_esr_pc)); | ||
1298 | gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset, | ||
1299 | sm_error_states->hww_global_esr_report_mask); | ||
1300 | gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset, | ||
1301 | sm_error_states->hww_warp_esr_report_mask); | ||
1302 | } | ||
1303 | |||
1271 | int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, | 1304 | int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, |
1272 | struct channel_gk20a *fault_ch) | 1305 | struct channel_gk20a *fault_ch) |
1273 | { | 1306 | { |
1274 | int sm_id; | 1307 | int sm_id; |
1275 | struct gr_gk20a *gr = &g->gr; | ||
1276 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | 1308 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); |
1277 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, | 1309 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, |
1278 | GPU_LIT_TPC_IN_GPC_STRIDE); | 1310 | GPU_LIT_TPC_IN_GPC_STRIDE); |
1279 | u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; | 1311 | u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; |
1312 | struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; | ||
1313 | struct tsg_gk20a *tsg = NULL; | ||
1280 | 1314 | ||
1281 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1315 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1282 | 1316 | ||
1283 | sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g, | 1317 | sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g, |
1284 | gr_gpc0_tpc0_sm_cfg_r() + offset)); | 1318 | gr_gpc0_tpc0_sm_cfg_r() + offset)); |
1285 | 1319 | ||
1286 | gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g, | 1320 | if (fault_ch != NULL) { |
1287 | gr_gpc0_tpc0_sm_hww_global_esr_r() + offset); | 1321 | tsg = tsg_gk20a_from_ch(fault_ch); |
1288 | gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g, | 1322 | } |
1289 | gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset); | 1323 | |
1290 | gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g, | 1324 | if (tsg == NULL) { |
1291 | gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset); | 1325 | nvgpu_err(g, "no valid tsg"); |
1292 | gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g, | 1326 | goto record_fail; |
1293 | gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset); | 1327 | } |
1294 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g, | 1328 | |
1295 | gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset); | 1329 | sm_error_states = tsg->sm_error_states + sm_id; |
1330 | gm20b_gr_read_sm_error_state(g, offset, sm_error_states); | ||
1296 | 1331 | ||
1332 | record_fail: | ||
1297 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 1333 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
1298 | 1334 | ||
1299 | return sm_id; | 1335 | return sm_id; |
@@ -1301,12 +1337,12 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, | |||
1301 | 1337 | ||
1302 | int gm20b_gr_update_sm_error_state(struct gk20a *g, | 1338 | int gm20b_gr_update_sm_error_state(struct gk20a *g, |
1303 | struct channel_gk20a *ch, u32 sm_id, | 1339 | struct channel_gk20a *ch, u32 sm_id, |
1304 | struct nvgpu_gr_sm_error_state *sm_error_state) | 1340 | struct nvgpu_tsg_sm_error_state *sm_error_state) |
1305 | { | 1341 | { |
1306 | u32 gpc, tpc, offset; | 1342 | u32 gpc, tpc, offset; |
1307 | struct gr_gk20a *gr = &g->gr; | ||
1308 | struct tsg_gk20a *tsg; | 1343 | struct tsg_gk20a *tsg; |
1309 | struct nvgpu_gr_ctx *ch_ctx; | 1344 | struct nvgpu_gr_ctx *ch_ctx; |
1345 | struct nvgpu_tsg_sm_error_state *tsg_sm_error_states; | ||
1310 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | 1346 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); |
1311 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, | 1347 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, |
1312 | GPU_LIT_TPC_IN_GPC_STRIDE); | 1348 | GPU_LIT_TPC_IN_GPC_STRIDE); |
@@ -1320,16 +1356,8 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g, | |||
1320 | 1356 | ||
1321 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1357 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1322 | 1358 | ||
1323 | gr->sm_error_states[sm_id].hww_global_esr = | 1359 | tsg_sm_error_states = tsg->sm_error_states + sm_id; |
1324 | sm_error_state->hww_global_esr; | 1360 | gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state); |
1325 | gr->sm_error_states[sm_id].hww_warp_esr = | ||
1326 | sm_error_state->hww_warp_esr; | ||
1327 | gr->sm_error_states[sm_id].hww_warp_esr_pc = | ||
1328 | sm_error_state->hww_warp_esr_pc; | ||
1329 | gr->sm_error_states[sm_id].hww_global_esr_report_mask = | ||
1330 | sm_error_state->hww_global_esr_report_mask; | ||
1331 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask = | ||
1332 | sm_error_state->hww_warp_esr_report_mask; | ||
1333 | 1361 | ||
1334 | err = gr_gk20a_disable_ctxsw(g); | 1362 | err = gr_gk20a_disable_ctxsw(g); |
1335 | if (err) { | 1363 | if (err) { |
@@ -1343,29 +1371,20 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g, | |||
1343 | offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; | 1371 | offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; |
1344 | 1372 | ||
1345 | if (gk20a_is_channel_ctx_resident(ch)) { | 1373 | if (gk20a_is_channel_ctx_resident(ch)) { |
1346 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset, | 1374 | gm20b_gr_write_sm_error_state(g, offset, tsg_sm_error_states); |
1347 | gr->sm_error_states[sm_id].hww_global_esr); | ||
1348 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset, | ||
1349 | gr->sm_error_states[sm_id].hww_warp_esr); | ||
1350 | gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset, | ||
1351 | gr->sm_error_states[sm_id].hww_warp_esr_pc); | ||
1352 | gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset, | ||
1353 | gr->sm_error_states[sm_id].hww_global_esr_report_mask); | ||
1354 | gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset, | ||
1355 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask); | ||
1356 | } else { | 1375 | } else { |
1357 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); | 1376 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); |
1358 | if (err) | 1377 | if (err) |
1359 | goto enable_ctxsw; | 1378 | goto enable_ctxsw; |
1360 | 1379 | ||
1361 | gr_gk20a_ctx_patch_write(g, ch_ctx, | 1380 | gr_gk20a_ctx_patch_write(g, ch_ctx, |
1362 | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset, | 1381 | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset, |
1363 | gr->sm_error_states[sm_id].hww_global_esr_report_mask, | 1382 | tsg_sm_error_states->hww_global_esr_report_mask, |
1364 | true); | 1383 | true); |
1365 | gr_gk20a_ctx_patch_write(g, ch_ctx, | 1384 | gr_gk20a_ctx_patch_write(g, ch_ctx, |
1366 | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset, | 1385 | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset, |
1367 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask, | 1386 | tsg_sm_error_states->hww_warp_esr_report_mask, |
1368 | true); | 1387 | true); |
1369 | 1388 | ||
1370 | gr_gk20a_ctx_patch_write_end(g, ch_ctx, false); | 1389 | gr_gk20a_ctx_patch_write_end(g, ch_ctx, false); |
1371 | } | 1390 | } |
@@ -1383,15 +1402,20 @@ int gm20b_gr_clear_sm_error_state(struct gk20a *g, | |||
1383 | { | 1402 | { |
1384 | u32 gpc, tpc, offset; | 1403 | u32 gpc, tpc, offset; |
1385 | u32 val; | 1404 | u32 val; |
1386 | struct gr_gk20a *gr = &g->gr; | 1405 | struct tsg_gk20a *tsg; |
1387 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | 1406 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); |
1388 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, | 1407 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, |
1389 | GPU_LIT_TPC_IN_GPC_STRIDE); | 1408 | GPU_LIT_TPC_IN_GPC_STRIDE); |
1390 | int err = 0; | 1409 | int err = 0; |
1391 | 1410 | ||
1411 | tsg = tsg_gk20a_from_ch(ch); | ||
1412 | if (tsg == NULL) { | ||
1413 | return -EINVAL; | ||
1414 | } | ||
1415 | |||
1392 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1416 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1393 | 1417 | ||
1394 | memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states)); | 1418 | memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states)); |
1395 | 1419 | ||
1396 | err = gr_gk20a_disable_ctxsw(g); | 1420 | err = gr_gk20a_disable_ctxsw(g); |
1397 | if (err) { | 1421 | if (err) { |
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h index 9d8e5cdf..7c3baa59 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h | |||
@@ -119,7 +119,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, | |||
119 | u32 tpc, u32 sm, struct channel_gk20a *fault_ch); | 119 | u32 tpc, u32 sm, struct channel_gk20a *fault_ch); |
120 | int gm20b_gr_update_sm_error_state(struct gk20a *g, | 120 | int gm20b_gr_update_sm_error_state(struct gk20a *g, |
121 | struct channel_gk20a *ch, u32 sm_id, | 121 | struct channel_gk20a *ch, u32 sm_id, |
122 | struct nvgpu_gr_sm_error_state *sm_error_state); | 122 | struct nvgpu_tsg_sm_error_state *sm_error_state); |
123 | int gm20b_gr_clear_sm_error_state(struct gk20a *g, | 123 | int gm20b_gr_clear_sm_error_state(struct gk20a *g, |
124 | struct channel_gk20a *ch, u32 sm_id); | 124 | struct channel_gk20a *ch, u32 sm_id); |
125 | int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, | 125 | int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, |
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 1e001824..bc659a7b 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c | |||
@@ -3212,18 +3212,42 @@ void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state) | |||
3212 | } | 3212 | } |
3213 | } | 3213 | } |
3214 | 3214 | ||
3215 | static void gv11b_gr_write_sm_error_state(struct gk20a *g, | ||
3216 | u32 offset, | ||
3217 | struct nvgpu_tsg_sm_error_state *sm_error_states) | ||
3218 | { | ||
3219 | nvgpu_writel(g, | ||
3220 | gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset, | ||
3221 | sm_error_states->hww_global_esr); | ||
3222 | nvgpu_writel(g, | ||
3223 | gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, | ||
3224 | sm_error_states->hww_warp_esr); | ||
3225 | nvgpu_writel(g, | ||
3226 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset, | ||
3227 | u64_lo32(sm_error_states->hww_warp_esr_pc)); | ||
3228 | nvgpu_writel(g, | ||
3229 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset, | ||
3230 | u64_hi32(sm_error_states->hww_warp_esr_pc)); | ||
3231 | nvgpu_writel(g, | ||
3232 | gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset, | ||
3233 | sm_error_states->hww_global_esr_report_mask); | ||
3234 | nvgpu_writel(g, | ||
3235 | gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset, | ||
3236 | sm_error_states->hww_warp_esr_report_mask); | ||
3237 | } | ||
3238 | |||
3215 | int gv11b_gr_update_sm_error_state(struct gk20a *g, | 3239 | int gv11b_gr_update_sm_error_state(struct gk20a *g, |
3216 | struct channel_gk20a *ch, u32 sm_id, | 3240 | struct channel_gk20a *ch, u32 sm_id, |
3217 | struct nvgpu_gr_sm_error_state *sm_error_state) | 3241 | struct nvgpu_tsg_sm_error_state *sm_error_state) |
3218 | { | 3242 | { |
3219 | struct tsg_gk20a *tsg; | 3243 | struct tsg_gk20a *tsg; |
3220 | u32 gpc, tpc, sm, offset; | 3244 | u32 gpc, tpc, sm, offset; |
3221 | struct gr_gk20a *gr = &g->gr; | ||
3222 | struct nvgpu_gr_ctx *ch_ctx; | 3245 | struct nvgpu_gr_ctx *ch_ctx; |
3223 | int err = 0; | 3246 | int err = 0; |
3247 | struct nvgpu_tsg_sm_error_state *tsg_sm_error_states; | ||
3224 | 3248 | ||
3225 | tsg = tsg_gk20a_from_ch(ch); | 3249 | tsg = tsg_gk20a_from_ch(ch); |
3226 | if (!tsg) { | 3250 | if (tsg == NULL) { |
3227 | return -EINVAL; | 3251 | return -EINVAL; |
3228 | } | 3252 | } |
3229 | 3253 | ||
@@ -3231,16 +3255,8 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g, | |||
3231 | 3255 | ||
3232 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 3256 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
3233 | 3257 | ||
3234 | gr->sm_error_states[sm_id].hww_global_esr = | 3258 | tsg_sm_error_states = tsg->sm_error_states + sm_id; |
3235 | sm_error_state->hww_global_esr; | 3259 | gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state); |
3236 | gr->sm_error_states[sm_id].hww_warp_esr = | ||
3237 | sm_error_state->hww_warp_esr; | ||
3238 | gr->sm_error_states[sm_id].hww_warp_esr_pc = | ||
3239 | sm_error_state->hww_warp_esr_pc; | ||
3240 | gr->sm_error_states[sm_id].hww_global_esr_report_mask = | ||
3241 | sm_error_state->hww_global_esr_report_mask; | ||
3242 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask = | ||
3243 | sm_error_state->hww_warp_esr_report_mask; | ||
3244 | 3260 | ||
3245 | err = gr_gk20a_disable_ctxsw(g); | 3261 | err = gr_gk20a_disable_ctxsw(g); |
3246 | if (err) { | 3262 | if (err) { |
@@ -3257,21 +3273,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g, | |||
3257 | gv11b_gr_sm_offset(g, sm); | 3273 | gv11b_gr_sm_offset(g, sm); |
3258 | 3274 | ||
3259 | if (gk20a_is_channel_ctx_resident(ch)) { | 3275 | if (gk20a_is_channel_ctx_resident(ch)) { |
3260 | gk20a_writel(g, | 3276 | gv11b_gr_write_sm_error_state(g, offset, tsg_sm_error_states); |
3261 | gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset, | ||
3262 | gr->sm_error_states[sm_id].hww_global_esr); | ||
3263 | gk20a_writel(g, | ||
3264 | gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, | ||
3265 | gr->sm_error_states[sm_id].hww_warp_esr); | ||
3266 | gk20a_writel(g, | ||
3267 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset, | ||
3268 | gr->sm_error_states[sm_id].hww_warp_esr_pc); | ||
3269 | gk20a_writel(g, | ||
3270 | gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset, | ||
3271 | gr->sm_error_states[sm_id].hww_global_esr_report_mask); | ||
3272 | gk20a_writel(g, | ||
3273 | gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset, | ||
3274 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask); | ||
3275 | } else { | 3277 | } else { |
3276 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); | 3278 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); |
3277 | if (err) { | 3279 | if (err) { |
@@ -3281,12 +3283,12 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g, | |||
3281 | gr_gk20a_ctx_patch_write(g, ch_ctx, | 3283 | gr_gk20a_ctx_patch_write(g, ch_ctx, |
3282 | gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r() + | 3284 | gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r() + |
3283 | offset, | 3285 | offset, |
3284 | gr->sm_error_states[sm_id].hww_global_esr_report_mask, | 3286 | tsg_sm_error_states->hww_global_esr_report_mask, |
3285 | true); | 3287 | true); |
3286 | gr_gk20a_ctx_patch_write(g, ch_ctx, | 3288 | gr_gk20a_ctx_patch_write(g, ch_ctx, |
3287 | gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r() + | 3289 | gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r() + |
3288 | offset, | 3290 | offset, |
3289 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask, | 3291 | tsg_sm_error_states->hww_warp_esr_report_mask, |
3290 | true); | 3292 | true); |
3291 | 3293 | ||
3292 | gr_gk20a_ctx_patch_write_end(g, ch_ctx, false); | 3294 | gr_gk20a_ctx_patch_write_end(g, ch_ctx, false); |
@@ -3362,13 +3364,36 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, | |||
3362 | return err; | 3364 | return err; |
3363 | } | 3365 | } |
3364 | 3366 | ||
3367 | static void gv11b_gr_read_sm_error_state(struct gk20a *g, | ||
3368 | u32 offset, | ||
3369 | struct nvgpu_tsg_sm_error_state *sm_error_states) | ||
3370 | { | ||
3371 | sm_error_states->hww_global_esr = nvgpu_readl(g, | ||
3372 | gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset); | ||
3373 | |||
3374 | sm_error_states->hww_warp_esr = nvgpu_readl(g, | ||
3375 | gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset); | ||
3376 | |||
3377 | sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g, | ||
3378 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)), | ||
3379 | (nvgpu_readl(g, | ||
3380 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset))); | ||
3381 | |||
3382 | sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g, | ||
3383 | gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset); | ||
3384 | |||
3385 | sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g, | ||
3386 | gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset); | ||
3387 | } | ||
3388 | |||
3365 | int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, | 3389 | int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, |
3366 | struct channel_gk20a *fault_ch) | 3390 | struct channel_gk20a *fault_ch) |
3367 | { | 3391 | { |
3368 | int sm_id; | 3392 | int sm_id; |
3369 | struct gr_gk20a *gr = &g->gr; | ||
3370 | u32 offset, sm_per_tpc, tpc_id; | 3393 | u32 offset, sm_per_tpc, tpc_id; |
3371 | u32 gpc_offset, gpc_tpc_offset; | 3394 | u32 gpc_offset, gpc_tpc_offset; |
3395 | struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; | ||
3396 | struct tsg_gk20a *tsg = NULL; | ||
3372 | 3397 | ||
3373 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 3398 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
3374 | 3399 | ||
@@ -3381,21 +3406,19 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, | |||
3381 | 3406 | ||
3382 | offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm); | 3407 | offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm); |
3383 | 3408 | ||
3384 | gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g, | 3409 | if (fault_ch != NULL) { |
3385 | gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset); | 3410 | tsg = tsg_gk20a_from_ch(fault_ch); |
3386 | 3411 | } | |
3387 | gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g, | ||
3388 | gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset); | ||
3389 | |||
3390 | gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g, | ||
3391 | gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset); | ||
3392 | 3412 | ||
3393 | gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g, | 3413 | if (tsg == NULL) { |
3394 | gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset); | 3414 | nvgpu_err(g, "no valid tsg"); |
3415 | goto record_fail; | ||
3416 | } | ||
3395 | 3417 | ||
3396 | gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g, | 3418 | sm_error_states = tsg->sm_error_states + sm_id; |
3397 | gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset); | 3419 | gv11b_gr_read_sm_error_state(g, offset, sm_error_states); |
3398 | 3420 | ||
3421 | record_fail: | ||
3399 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 3422 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
3400 | 3423 | ||
3401 | return sm_id; | 3424 | return sm_id; |
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 0f29ea24..30cc7f0a 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h | |||
@@ -43,7 +43,7 @@ struct zbc_entry; | |||
43 | struct zbc_query_params; | 43 | struct zbc_query_params; |
44 | struct nvgpu_gr_ctx; | 44 | struct nvgpu_gr_ctx; |
45 | struct nvgpu_warpstate; | 45 | struct nvgpu_warpstate; |
46 | struct nvgpu_gr_sm_error_state; | 46 | struct nvgpu_tsg_sm_error_state; |
47 | struct gr_ctx_desc; | 47 | struct gr_ctx_desc; |
48 | struct gr_gk20a_isr_data; | 48 | struct gr_gk20a_isr_data; |
49 | struct gk20a_debug_output; | 49 | struct gk20a_debug_output; |
@@ -168,7 +168,7 @@ int gv11b_gr_sm_trigger_suspend(struct gk20a *g); | |||
168 | void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state); | 168 | void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state); |
169 | int gv11b_gr_update_sm_error_state(struct gk20a *g, | 169 | int gv11b_gr_update_sm_error_state(struct gk20a *g, |
170 | struct channel_gk20a *ch, u32 sm_id, | 170 | struct channel_gk20a *ch, u32 sm_id, |
171 | struct nvgpu_gr_sm_error_state *sm_error_state); | 171 | struct nvgpu_tsg_sm_error_state *sm_error_state); |
172 | int gv11b_gr_set_sm_debug_mode(struct gk20a *g, | 172 | int gv11b_gr_set_sm_debug_mode(struct gk20a *g, |
173 | struct channel_gk20a *ch, u64 sms, bool enable); | 173 | struct channel_gk20a *ch, u64 sms, bool enable); |
174 | int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, | 174 | int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h index 39d68dd1..f7a58c87 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h | |||
@@ -22,8 +22,8 @@ | |||
22 | * DEALINGS IN THE SOFTWARE. | 22 | * DEALINGS IN THE SOFTWARE. |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #ifndef __TEGRA_VGPU_H | 25 | #ifndef TEGRA_VGPU_H |
26 | #define __TEGRA_VGPU_H | 26 | #define TEGRA_VGPU_H |
27 | 27 | ||
28 | #include <nvgpu/types.h> | 28 | #include <nvgpu/types.h> |
29 | #include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */ | 29 | #include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */ |
@@ -737,6 +737,7 @@ struct tegra_vgpu_channel_event_info { | |||
737 | }; | 737 | }; |
738 | 738 | ||
739 | struct tegra_vgpu_sm_esr_info { | 739 | struct tegra_vgpu_sm_esr_info { |
740 | u32 tsg_id; | ||
740 | u32 sm_id; | 741 | u32 sm_id; |
741 | u32 hww_global_esr; | 742 | u32 hww_global_esr; |
742 | u32 hww_warp_esr; | 743 | u32 hww_warp_esr; |
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c index fc1f7011..2f013029 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | |||
@@ -1567,56 +1567,6 @@ out: | |||
1567 | return err; | 1567 | return err; |
1568 | } | 1568 | } |
1569 | 1569 | ||
1570 | static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g, | ||
1571 | struct nvgpu_gpu_read_single_sm_error_state_args *args) | ||
1572 | { | ||
1573 | struct gr_gk20a *gr = &g->gr; | ||
1574 | struct nvgpu_gr_sm_error_state *sm_error_state; | ||
1575 | struct nvgpu_gpu_sm_error_state_record sm_error_state_record; | ||
1576 | u32 sm_id; | ||
1577 | int err = 0; | ||
1578 | |||
1579 | sm_id = args->sm_id; | ||
1580 | if (sm_id >= gr->no_of_sm) | ||
1581 | return -EINVAL; | ||
1582 | |||
1583 | nvgpu_speculation_barrier(); | ||
1584 | |||
1585 | sm_error_state = gr->sm_error_states + sm_id; | ||
1586 | sm_error_state_record.global_esr = | ||
1587 | sm_error_state->hww_global_esr; | ||
1588 | sm_error_state_record.warp_esr = | ||
1589 | sm_error_state->hww_warp_esr; | ||
1590 | sm_error_state_record.warp_esr_pc = | ||
1591 | sm_error_state->hww_warp_esr_pc; | ||
1592 | sm_error_state_record.global_esr_report_mask = | ||
1593 | sm_error_state->hww_global_esr_report_mask; | ||
1594 | sm_error_state_record.warp_esr_report_mask = | ||
1595 | sm_error_state->hww_warp_esr_report_mask; | ||
1596 | |||
1597 | if (args->record_size > 0) { | ||
1598 | size_t write_size = sizeof(*sm_error_state); | ||
1599 | |||
1600 | if (write_size > args->record_size) | ||
1601 | write_size = args->record_size; | ||
1602 | |||
1603 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | ||
1604 | err = copy_to_user((void __user *)(uintptr_t) | ||
1605 | args->record_mem, | ||
1606 | &sm_error_state_record, | ||
1607 | write_size); | ||
1608 | nvgpu_mutex_release(&g->dbg_sessions_lock); | ||
1609 | if (err) { | ||
1610 | nvgpu_err(g, "copy_to_user failed!"); | ||
1611 | return err; | ||
1612 | } | ||
1613 | |||
1614 | args->record_size = write_size; | ||
1615 | } | ||
1616 | |||
1617 | return 0; | ||
1618 | } | ||
1619 | |||
1620 | long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 1570 | long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
1621 | { | 1571 | { |
1622 | struct gk20a_ctrl_priv *priv = filp->private_data; | 1572 | struct gk20a_ctrl_priv *priv = filp->private_data; |
@@ -1925,11 +1875,6 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg | |||
1925 | (struct nvgpu_gpu_set_deterministic_opts_args *)buf); | 1875 | (struct nvgpu_gpu_set_deterministic_opts_args *)buf); |
1926 | break; | 1876 | break; |
1927 | 1877 | ||
1928 | case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE: | ||
1929 | err = nvgpu_gpu_read_single_sm_error_state(g, | ||
1930 | (struct nvgpu_gpu_read_single_sm_error_state_args *)buf); | ||
1931 | break; | ||
1932 | |||
1933 | default: | 1878 | default: |
1934 | nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd); | 1879 | nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd); |
1935 | err = -ENOTTY; | 1880 | err = -ENOTTY; |
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c index ff4fcdca..4ac4fb62 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c | |||
@@ -35,6 +35,7 @@ | |||
35 | 35 | ||
36 | #include "gk20a/gk20a.h" | 36 | #include "gk20a/gk20a.h" |
37 | #include "gk20a/gr_gk20a.h" | 37 | #include "gk20a/gr_gk20a.h" |
38 | #include "gk20a/tsg_gk20a.h" | ||
38 | #include "gk20a/regops_gk20a.h" | 39 | #include "gk20a/regops_gk20a.h" |
39 | #include "gk20a/dbg_gpu_gk20a.h" | 40 | #include "gk20a/dbg_gpu_gk20a.h" |
40 | #include "os_linux.h" | 41 | #include "os_linux.h" |
@@ -271,20 +272,23 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( | |||
271 | u32 sm_id; | 272 | u32 sm_id; |
272 | struct channel_gk20a *ch; | 273 | struct channel_gk20a *ch; |
273 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; | 274 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; |
274 | struct nvgpu_gr_sm_error_state sm_error_state; | 275 | struct nvgpu_tsg_sm_error_state sm_error_state; |
275 | int err = 0; | 276 | int err = 0; |
276 | 277 | ||
277 | /* Not currently supported in the virtual case */ | 278 | /* Not currently supported in the virtual case */ |
278 | if (g->is_virtual) | 279 | if (g->is_virtual) { |
279 | return -ENOSYS; | 280 | return -ENOSYS; |
281 | } | ||
280 | 282 | ||
281 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); | 283 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); |
282 | if (!ch) | 284 | if (ch == NULL) { |
283 | return -EINVAL; | 285 | return -EINVAL; |
286 | } | ||
284 | 287 | ||
285 | sm_id = args->sm_id; | 288 | sm_id = args->sm_id; |
286 | if (sm_id >= gr->no_of_sm) | 289 | if (sm_id >= gr->no_of_sm) { |
287 | return -EINVAL; | 290 | return -EINVAL; |
291 | } | ||
288 | 292 | ||
289 | nvgpu_speculation_barrier(); | 293 | nvgpu_speculation_barrier(); |
290 | 294 | ||
@@ -300,13 +304,15 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( | |||
300 | args->sm_error_state_record_mem, | 304 | args->sm_error_state_record_mem, |
301 | read_size); | 305 | read_size); |
302 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 306 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
303 | if (err) | 307 | if (err != 0) { |
304 | return -ENOMEM; | 308 | return -ENOMEM; |
309 | } | ||
305 | } | 310 | } |
306 | 311 | ||
307 | err = gk20a_busy(g); | 312 | err = gk20a_busy(g); |
308 | if (err) | 313 | if (err != 0) { |
309 | return err; | 314 | return err; |
315 | } | ||
310 | 316 | ||
311 | sm_error_state.hww_global_esr = | 317 | sm_error_state.hww_global_esr = |
312 | sm_error_state_record.hww_global_esr; | 318 | sm_error_state_record.hww_global_esr; |
@@ -335,18 +341,36 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( | |||
335 | { | 341 | { |
336 | struct gk20a *g = dbg_s->g; | 342 | struct gk20a *g = dbg_s->g; |
337 | struct gr_gk20a *gr = &g->gr; | 343 | struct gr_gk20a *gr = &g->gr; |
338 | struct nvgpu_gr_sm_error_state *sm_error_state; | 344 | struct nvgpu_tsg_sm_error_state *sm_error_state; |
339 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; | 345 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; |
346 | struct channel_gk20a *ch; | ||
347 | struct tsg_gk20a *tsg; | ||
340 | u32 sm_id; | 348 | u32 sm_id; |
341 | int err = 0; | 349 | int err = 0; |
342 | 350 | ||
351 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); | ||
352 | if (ch == NULL) { | ||
353 | return -EINVAL; | ||
354 | } | ||
355 | |||
356 | tsg = tsg_gk20a_from_ch(ch); | ||
357 | if (tsg == NULL) { | ||
358 | nvgpu_err(g, "no valid tsg from ch"); | ||
359 | return -EINVAL; | ||
360 | } | ||
361 | |||
343 | sm_id = args->sm_id; | 362 | sm_id = args->sm_id; |
344 | if (sm_id >= gr->no_of_sm) | 363 | if (sm_id >= gr->no_of_sm) { |
345 | return -EINVAL; | 364 | return -EINVAL; |
365 | } | ||
366 | |||
367 | if (tsg->sm_error_states == NULL) { | ||
368 | return -EINVAL; | ||
369 | } | ||
346 | 370 | ||
347 | nvgpu_speculation_barrier(); | 371 | nvgpu_speculation_barrier(); |
348 | 372 | ||
349 | sm_error_state = gr->sm_error_states + sm_id; | 373 | sm_error_state = tsg->sm_error_states + sm_id; |
350 | sm_error_state_record.hww_global_esr = | 374 | sm_error_state_record.hww_global_esr = |
351 | sm_error_state->hww_global_esr; | 375 | sm_error_state->hww_global_esr; |
352 | sm_error_state_record.hww_warp_esr = | 376 | sm_error_state_record.hww_warp_esr = |
@@ -370,7 +394,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( | |||
370 | &sm_error_state_record, | 394 | &sm_error_state_record, |
371 | write_size); | 395 | write_size); |
372 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 396 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
373 | if (err) { | 397 | if (err != 0) { |
374 | nvgpu_err(g, "copy_to_user failed!"); | 398 | nvgpu_err(g, "copy_to_user failed!"); |
375 | return err; | 399 | return err; |
376 | } | 400 | } |
@@ -1500,8 +1524,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state( | |||
1500 | int err = 0; | 1524 | int err = 0; |
1501 | 1525 | ||
1502 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); | 1526 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); |
1503 | if (!ch) | 1527 | if (ch == NULL) { |
1504 | return -EINVAL; | 1528 | return -EINVAL; |
1529 | } | ||
1505 | 1530 | ||
1506 | sm_id = args->sm_id; | 1531 | sm_id = args->sm_id; |
1507 | if (sm_id >= gr->no_of_sm) | 1532 | if (sm_id >= gr->no_of_sm) |
@@ -1510,8 +1535,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state( | |||
1510 | nvgpu_speculation_barrier(); | 1535 | nvgpu_speculation_barrier(); |
1511 | 1536 | ||
1512 | err = gk20a_busy(g); | 1537 | err = gk20a_busy(g); |
1513 | if (err) | 1538 | if (err != 0) { |
1514 | return err; | 1539 | return err; |
1540 | } | ||
1515 | 1541 | ||
1516 | err = gr_gk20a_elpg_protected_call(g, | 1542 | err = gr_gk20a_elpg_protected_call(g, |
1517 | g->ops.gr.clear_sm_error_state(g, ch, sm_id)); | 1543 | g->ops.gr.clear_sm_error_state(g, ch, sm_id)); |
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c index f7d20f34..6c68ca58 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c | |||
@@ -536,6 +536,57 @@ static int gk20a_tsg_ioctl_get_timeslice(struct gk20a *g, | |||
536 | return 0; | 536 | return 0; |
537 | } | 537 | } |
538 | 538 | ||
539 | static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g, | ||
540 | struct tsg_gk20a *tsg, | ||
541 | struct nvgpu_tsg_read_single_sm_error_state_args *args) | ||
542 | { | ||
543 | struct gr_gk20a *gr = &g->gr; | ||
544 | struct nvgpu_tsg_sm_error_state *sm_error_state; | ||
545 | struct nvgpu_tsg_sm_error_state_record sm_error_state_record; | ||
546 | u32 sm_id; | ||
547 | int err = 0; | ||
548 | |||
549 | sm_id = args->sm_id; | ||
550 | if (sm_id >= gr->no_of_sm) | ||
551 | return -EINVAL; | ||
552 | |||
553 | nvgpu_speculation_barrier(); | ||
554 | |||
555 | sm_error_state = tsg->sm_error_states + sm_id; | ||
556 | sm_error_state_record.global_esr = | ||
557 | sm_error_state->hww_global_esr; | ||
558 | sm_error_state_record.warp_esr = | ||
559 | sm_error_state->hww_warp_esr; | ||
560 | sm_error_state_record.warp_esr_pc = | ||
561 | sm_error_state->hww_warp_esr_pc; | ||
562 | sm_error_state_record.global_esr_report_mask = | ||
563 | sm_error_state->hww_global_esr_report_mask; | ||
564 | sm_error_state_record.warp_esr_report_mask = | ||
565 | sm_error_state->hww_warp_esr_report_mask; | ||
566 | |||
567 | if (args->record_size > 0) { | ||
568 | size_t write_size = sizeof(*sm_error_state); | ||
569 | |||
570 | if (write_size > args->record_size) | ||
571 | write_size = args->record_size; | ||
572 | |||
573 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | ||
574 | err = copy_to_user((void __user *)(uintptr_t) | ||
575 | args->record_mem, | ||
576 | &sm_error_state_record, | ||
577 | write_size); | ||
578 | nvgpu_mutex_release(&g->dbg_sessions_lock); | ||
579 | if (err) { | ||
580 | nvgpu_err(g, "copy_to_user failed!"); | ||
581 | return err; | ||
582 | } | ||
583 | |||
584 | args->record_size = write_size; | ||
585 | } | ||
586 | |||
587 | return 0; | ||
588 | } | ||
589 | |||
539 | long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd, | 590 | long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd, |
540 | unsigned long arg) | 591 | unsigned long arg) |
541 | { | 592 | { |
@@ -670,6 +721,13 @@ long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd, | |||
670 | break; | 721 | break; |
671 | } | 722 | } |
672 | 723 | ||
724 | case NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE: | ||
725 | { | ||
726 | err = gk20a_tsg_ioctl_read_single_sm_error_state(g, tsg, | ||
727 | (struct nvgpu_tsg_read_single_sm_error_state_args *)buf); | ||
728 | break; | ||
729 | } | ||
730 | |||
673 | default: | 731 | default: |
674 | nvgpu_err(g, "unrecognized tsg gpu ioctl cmd: 0x%x", | 732 | nvgpu_err(g, "unrecognized tsg gpu ioctl cmd: 0x%x", |
675 | cmd); | 733 | cmd); |
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c index fa64cb82..9ee57fb4 100644 --- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c | |||
@@ -882,9 +882,6 @@ static void vgpu_remove_gr_support(struct gr_gk20a *gr) | |||
882 | 882 | ||
883 | gk20a_comptag_allocator_destroy(gr->g, &gr->comp_tags); | 883 | gk20a_comptag_allocator_destroy(gr->g, &gr->comp_tags); |
884 | 884 | ||
885 | nvgpu_kfree(gr->g, gr->sm_error_states); | ||
886 | gr->sm_error_states = NULL; | ||
887 | |||
888 | nvgpu_kfree(gr->g, gr->gpc_tpc_mask); | 885 | nvgpu_kfree(gr->g, gr->gpc_tpc_mask); |
889 | gr->gpc_tpc_mask = NULL; | 886 | gr->gpc_tpc_mask = NULL; |
890 | 887 | ||
@@ -935,14 +932,6 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g) | |||
935 | nvgpu_mutex_init(&gr->ctx_mutex); | 932 | nvgpu_mutex_init(&gr->ctx_mutex); |
936 | nvgpu_spinlock_init(&gr->ch_tlb_lock); | 933 | nvgpu_spinlock_init(&gr->ch_tlb_lock); |
937 | 934 | ||
938 | gr->sm_error_states = nvgpu_kzalloc(g, | ||
939 | sizeof(struct nvgpu_gr_sm_error_state) * | ||
940 | gr->no_of_sm); | ||
941 | if (!gr->sm_error_states) { | ||
942 | err = -ENOMEM; | ||
943 | goto clean_up; | ||
944 | } | ||
945 | |||
946 | gr->remove_support = vgpu_remove_gr_support; | 935 | gr->remove_support = vgpu_remove_gr_support; |
947 | gr->sw_ready = true; | 936 | gr->sw_ready = true; |
948 | 937 | ||
@@ -1152,12 +1141,17 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, | |||
1152 | int vgpu_gr_clear_sm_error_state(struct gk20a *g, | 1141 | int vgpu_gr_clear_sm_error_state(struct gk20a *g, |
1153 | struct channel_gk20a *ch, u32 sm_id) | 1142 | struct channel_gk20a *ch, u32 sm_id) |
1154 | { | 1143 | { |
1155 | struct gr_gk20a *gr = &g->gr; | ||
1156 | struct tegra_vgpu_cmd_msg msg; | 1144 | struct tegra_vgpu_cmd_msg msg; |
1157 | struct tegra_vgpu_clear_sm_error_state *p = | 1145 | struct tegra_vgpu_clear_sm_error_state *p = |
1158 | &msg.params.clear_sm_error_state; | 1146 | &msg.params.clear_sm_error_state; |
1147 | struct tsg_gk20a *tsg; | ||
1159 | int err; | 1148 | int err; |
1160 | 1149 | ||
1150 | tsg = tsg_gk20a_from_ch(ch); | ||
1151 | if (!tsg) { | ||
1152 | return -EINVAL; | ||
1153 | } | ||
1154 | |||
1161 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1155 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1162 | msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE; | 1156 | msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE; |
1163 | msg.handle = vgpu_get_handle(g); | 1157 | msg.handle = vgpu_get_handle(g); |
@@ -1167,7 +1161,7 @@ int vgpu_gr_clear_sm_error_state(struct gk20a *g, | |||
1167 | err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); | 1161 | err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); |
1168 | WARN_ON(err || msg.ret); | 1162 | WARN_ON(err || msg.ret); |
1169 | 1163 | ||
1170 | memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states)); | 1164 | memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states)); |
1171 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 1165 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
1172 | 1166 | ||
1173 | return err ? err : msg.ret; | 1167 | return err ? err : msg.ret; |
@@ -1264,7 +1258,8 @@ int vgpu_gr_resume_contexts(struct gk20a *g, | |||
1264 | void vgpu_gr_handle_sm_esr_event(struct gk20a *g, | 1258 | void vgpu_gr_handle_sm_esr_event(struct gk20a *g, |
1265 | struct tegra_vgpu_sm_esr_info *info) | 1259 | struct tegra_vgpu_sm_esr_info *info) |
1266 | { | 1260 | { |
1267 | struct nvgpu_gr_sm_error_state *sm_error_states; | 1261 | struct nvgpu_tsg_sm_error_state *sm_error_states; |
1262 | struct tsg_gk20a *tsg; | ||
1268 | 1263 | ||
1269 | if (info->sm_id >= g->gr.no_of_sm) { | 1264 | if (info->sm_id >= g->gr.no_of_sm) { |
1270 | nvgpu_err(g, "invalid smd_id %d / %d", | 1265 | nvgpu_err(g, "invalid smd_id %d / %d", |
@@ -1272,9 +1267,20 @@ void vgpu_gr_handle_sm_esr_event(struct gk20a *g, | |||
1272 | return; | 1267 | return; |
1273 | } | 1268 | } |
1274 | 1269 | ||
1270 | if (info->tsg_id >= g->fifo.num_channels) { | ||
1271 | nvgpu_err(g, "invalid tsg_id in sm esr event"); | ||
1272 | return; | ||
1273 | } | ||
1274 | |||
1275 | tsg = &g->fifo.tsg[info->tsg_id]; | ||
1276 | if (tsg == NULL) { | ||
1277 | nvgpu_err(g, "invalid tsg"); | ||
1278 | return; | ||
1279 | } | ||
1280 | |||
1275 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1281 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1276 | 1282 | ||
1277 | sm_error_states = &g->gr.sm_error_states[info->sm_id]; | 1283 | sm_error_states = &tsg->sm_error_states[info->sm_id]; |
1278 | 1284 | ||
1279 | sm_error_states->hww_global_esr = info->hww_global_esr; | 1285 | sm_error_states->hww_global_esr = info->hww_global_esr; |
1280 | sm_error_states->hww_warp_esr = info->hww_warp_esr; | 1286 | sm_error_states->hww_warp_esr = info->hww_warp_esr; |
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 8d884872..b1dc4df4 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h | |||
@@ -861,38 +861,6 @@ struct nvgpu_gpu_set_deterministic_opts_args { | |||
861 | __u64 channels; /* in */ | 861 | __u64 channels; /* in */ |
862 | }; | 862 | }; |
863 | 863 | ||
864 | /* | ||
865 | * This struct helps to report the SM error state of a single SM. | ||
866 | * This acts upon the currently resident GR context. | ||
867 | * Global Error status register | ||
868 | * Warp Error status register | ||
869 | * Warp Error status register PC | ||
870 | * Global Error status register Report Mask | ||
871 | * Warp Error status register Report Mask | ||
872 | */ | ||
873 | struct nvgpu_gpu_sm_error_state_record { | ||
874 | __u32 global_esr; | ||
875 | __u32 warp_esr; | ||
876 | __u64 warp_esr_pc; | ||
877 | __u32 global_esr_report_mask; | ||
878 | __u32 warp_esr_report_mask; | ||
879 | }; | ||
880 | |||
881 | /* | ||
882 | * This struct helps to read the SM error state. | ||
883 | */ | ||
884 | struct nvgpu_gpu_read_single_sm_error_state_args { | ||
885 | /* Valid SM ID */ | ||
886 | __u32 sm_id; | ||
887 | __u32 reserved; | ||
888 | /* | ||
889 | * This is pointer to the struct nvgpu_gpu_sm_error_state_record | ||
890 | */ | ||
891 | __u64 record_mem; | ||
892 | /* size of the record size to read */ | ||
893 | __u64 record_size; | ||
894 | }; | ||
895 | |||
896 | #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \ | 864 | #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \ |
897 | _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args) | 865 | _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args) |
898 | #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \ | 866 | #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \ |
@@ -976,11 +944,8 @@ struct nvgpu_gpu_read_single_sm_error_state_args { | |||
976 | #define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \ | 944 | #define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \ |
977 | _IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \ | 945 | _IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \ |
978 | struct nvgpu_gpu_set_deterministic_opts_args) | 946 | struct nvgpu_gpu_set_deterministic_opts_args) |
979 | #define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \ | ||
980 | _IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \ | ||
981 | struct nvgpu_gpu_read_single_sm_error_state_args) | ||
982 | #define NVGPU_GPU_IOCTL_LAST \ | 947 | #define NVGPU_GPU_IOCTL_LAST \ |
983 | _IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE) | 948 | _IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS) |
984 | #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ | 949 | #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ |
985 | sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args) | 950 | sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args) |
986 | 951 | ||
@@ -1063,6 +1028,38 @@ struct nvgpu_tsg_bind_channel_ex_args { | |||
1063 | __u8 reserved[11]; | 1028 | __u8 reserved[11]; |
1064 | }; | 1029 | }; |
1065 | 1030 | ||
1031 | /* | ||
1032 | * This struct helps to report the SM error state of a single SM. | ||
1033 | * This acts upon the currently resident TSG context. | ||
1034 | * Global Error status register | ||
1035 | * Warp Error status register | ||
1036 | * Warp Error status register PC | ||
1037 | * Global Error status register Report Mask | ||
1038 | * Warp Error status register Report Mask | ||
1039 | */ | ||
1040 | struct nvgpu_tsg_sm_error_state_record { | ||
1041 | __u32 global_esr; | ||
1042 | __u32 warp_esr; | ||
1043 | __u64 warp_esr_pc; | ||
1044 | __u32 global_esr_report_mask; | ||
1045 | __u32 warp_esr_report_mask; | ||
1046 | }; | ||
1047 | |||
1048 | /* | ||
1049 | * This struct helps to read the SM error state. | ||
1050 | */ | ||
1051 | struct nvgpu_tsg_read_single_sm_error_state_args { | ||
1052 | /* Valid SM ID */ | ||
1053 | __u32 sm_id; | ||
1054 | __u32 reserved; | ||
1055 | /* | ||
1056 | * This is pointer to the struct nvgpu_gpu_sm_error_state_record | ||
1057 | */ | ||
1058 | __u64 record_mem; | ||
1059 | /* size of the record size to read */ | ||
1060 | __u64 record_size; | ||
1061 | }; | ||
1062 | |||
1066 | #define NVGPU_TSG_IOCTL_BIND_CHANNEL \ | 1063 | #define NVGPU_TSG_IOCTL_BIND_CHANNEL \ |
1067 | _IOW(NVGPU_TSG_IOCTL_MAGIC, 1, int) | 1064 | _IOW(NVGPU_TSG_IOCTL_MAGIC, 1, int) |
1068 | #define NVGPU_TSG_IOCTL_UNBIND_CHANNEL \ | 1065 | #define NVGPU_TSG_IOCTL_UNBIND_CHANNEL \ |
@@ -1083,10 +1080,13 @@ struct nvgpu_tsg_bind_channel_ex_args { | |||
1083 | _IOR(NVGPU_TSG_IOCTL_MAGIC, 10, struct nvgpu_timeslice_args) | 1080 | _IOR(NVGPU_TSG_IOCTL_MAGIC, 10, struct nvgpu_timeslice_args) |
1084 | #define NVGPU_TSG_IOCTL_BIND_CHANNEL_EX \ | 1081 | #define NVGPU_TSG_IOCTL_BIND_CHANNEL_EX \ |
1085 | _IOWR(NVGPU_TSG_IOCTL_MAGIC, 11, struct nvgpu_tsg_bind_channel_ex_args) | 1082 | _IOWR(NVGPU_TSG_IOCTL_MAGIC, 11, struct nvgpu_tsg_bind_channel_ex_args) |
1083 | #define NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE \ | ||
1084 | _IOR(NVGPU_TSG_IOCTL_MAGIC, 12, \ | ||
1085 | struct nvgpu_tsg_read_single_sm_error_state_args) | ||
1086 | #define NVGPU_TSG_IOCTL_MAX_ARG_SIZE \ | 1086 | #define NVGPU_TSG_IOCTL_MAX_ARG_SIZE \ |
1087 | sizeof(struct nvgpu_tsg_bind_channel_ex_args) | 1087 | sizeof(struct nvgpu_tsg_bind_channel_ex_args) |
1088 | #define NVGPU_TSG_IOCTL_LAST \ | 1088 | #define NVGPU_TSG_IOCTL_LAST \ |
1089 | _IOC_NR(NVGPU_TSG_IOCTL_BIND_CHANNEL_EX) | 1089 | _IOC_NR(NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE) |
1090 | 1090 | ||
1091 | /* | 1091 | /* |
1092 | * /dev/nvhost-dbg-gpu device | 1092 | * /dev/nvhost-dbg-gpu device |