gpu: nvgpu: correct calculation of sm_id for .record_sm_error_state

Starting with Volta, one TPC can have more than one SM, so .record_sm_error_state needs to take the SM number as a parameter. The logical TPC id should be read from gr_gpc0_gpm_pd_sm_id_r. Let the function return the logical sm_id; the RM server will need it to notify the client. Jira EVLR-2643 Bug 200405202 Change-Id: Iffaff05b89b1c5058616b8a6bf50dd73bd4e52f6 Signed-off-by: Richard Zhao <rizhao@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1742165 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Richard Zhao <rizhao@nvidia.com> 2018-06-06 23:46:03 -0400
committer: Tejal Kudav <tkudav@nvidia.com> 2018-06-14 09:44:08 -0400
commit: 6a46965eb3b7b657c089142579ab20d6efefc0fc (patch)
tree: 60aa4098d4b50af9db21e316098cfbe35c9a4797 /drivers
parent: 7a5d498a711833990a9d8fc3f5d3f3e26bee301c (diff)
6 files changed, 15 insertions, 17 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index a2b2e53f..49f2a34a 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -368,8 +368,8 @@ struct gpu_ops {
                void (*enable_exceptions)(struct gk20a *g);
                void (*create_gr_sysfs)(struct gk20a *g);
                u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
-                int (*record_sm_error_state)(struct gk20a *g, u32 gpc,
+                int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc,
-                                u32 tpc, struct channel_gk20a *fault_ch);
+                                u32 sm, struct channel_gk20a *fault_ch);
                int (*update_sm_error_state)(struct gk20a *g,
                                struct channel_gk20a *ch, u32 sm_id,
                                struct nvgpu_gr_sm_error_state *sm_error_state);
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 37ac8748..a082cd92 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5608,7 +5608,7 @@ int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                  "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
        gr_gk20a_elpg_protected_call(g,
-                g->ops.gr.record_sm_error_state(g, gpc, tpc, fault_ch));
+                g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
        if (g->ops.gr.pre_process_sm_exception) {
                ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 331c3af9..261c3054 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1278,7 +1278,7 @@ void gr_gm20b_get_access_map(struct gk20a *g,
        *num_entries = ARRAY_SIZE(wl_addr_gm20b);
 }
-int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
+int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                                struct channel_gk20a *fault_ch)
 {
        int sm_id;
@@ -1306,7 +1306,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
        nvgpu_mutex_release(&g->dbg_sessions_lock);
-        return 0;
+        return sm_id;
 }
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index ff32d8ff..5c82fd65 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -116,7 +116,7 @@ void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state);
 void gr_gm20b_get_access_map(struct gk20a *g,
                                   u32 **whitelist, int *num_entries);
 int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc,
-                u32 tpc, struct channel_gk20a *fault_ch);
+                u32 tpc, u32 sm, struct channel_gk20a *fault_ch);
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                struct channel_gk20a *ch, u32 sm_id,
                struct nvgpu_gr_sm_error_state *sm_error_state);
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 9bd48fdc..f57be9dd 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -3263,24 +3263,22 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
        return err;
 }
-int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
+int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                                struct channel_gk20a *fault_ch)
 {
        int sm_id;
        struct gr_gk20a *gr = &g->gr;
-        u32 offset, sm, sm_per_tpc;
+        u32 offset, sm_per_tpc, tpc_id;
-        u32 gpc_tpc_offset;
+        u32 gpc_offset, gpc_tpc_offset;
        nvgpu_mutex_acquire(&g->dbg_sessions_lock);
        sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
-        gpc_tpc_offset = gk20a_gr_gpc_offset(g, gpc) +
+        gpc_offset = gk20a_gr_gpc_offset(g, gpc);
-                                 gk20a_gr_tpc_offset(g, tpc);
+        gpc_tpc_offset = gpc_offset + gk20a_gr_tpc_offset(g, tpc);
-        sm_id = gr_gpc0_tpc0_sm_cfg_tpc_id_v(gk20a_readl(g,
+        tpc_id = gk20a_readl(g, gr_gpc0_gpm_pd_sm_id_r(tpc) + gpc_offset);
-                        gr_gpc0_tpc0_sm_cfg_r() + gpc_tpc_offset));
+        sm_id = tpc_id * sm_per_tpc + sm;
-        sm = sm_id % sm_per_tpc;
        offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm);
@@ -3301,7 +3299,7 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
        nvgpu_mutex_release(&g->dbg_sessions_lock);
-        return 0;
+        return sm_id;
 }
 void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index b4a7e411..f6f05a3b 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -171,7 +171,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
                struct nvgpu_gr_sm_error_state *sm_error_state);
 int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
        struct channel_gk20a *ch, u64 sms, bool enable);
-int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
+int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                struct channel_gk20a *fault_ch);
 void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g);
 bool gv11b_gr_sm_debugger_attached(struct gk20a *g);
author	Richard Zhao <rizhao@nvidia.com>	2018-06-06 23:46:03 -0400
committer	Tejal Kudav <tkudav@nvidia.com>	2018-06-14 09:44:08 -0400
commit	6a46965eb3b7b657c089142579ab20d6efefc0fc (patch)
tree	60aa4098d4b50af9db21e316098cfbe35c9a4797 /drivers
parent	7a5d498a711833990a9d8fc3f5d3f3e26bee301c (diff)