From 6a46965eb3b7b657c089142579ab20d6efefc0fc Mon Sep 17 00:00:00 2001 From: Richard Zhao Date: Wed, 6 Jun 2018 20:46:03 -0700 Subject: gpu: nvgpu: correct calculation of sm_id for .record_sm_error_state Starting with Volta, one TPC could have more than one SM. So .record_sm_error_state needs to have sm number as parameter. Logical tpc id should be read from gr_gpc0_gpm_pd_sm_id_r. Let the function return logical sm_id. RM server will need it to notify client. Jira EVLR-2643 Bug 200405202 Change-Id: Iffaff05b89b1c5058616b8a6bf50dd73bd4e52f6 Signed-off-by: Richard Zhao Reviewed-on: https://git-master.nvidia.com/r/1742165 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gk20a.h | 4 ++-- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 2 +- drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 4 ++-- drivers/gpu/nvgpu/gm20b/gr_gm20b.h | 2 +- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 18 ++++++++---------- drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 2 +- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index a2b2e53f..49f2a34a 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -368,8 +368,8 @@ struct gpu_ops { void (*enable_exceptions)(struct gk20a *g); void (*create_gr_sysfs)(struct gk20a *g); u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g); - int (*record_sm_error_state)(struct gk20a *g, u32 gpc, - u32 tpc, struct channel_gk20a *fault_ch); + int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc, + u32 sm, struct channel_gk20a *fault_ch); int (*update_sm_error_state)(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, struct nvgpu_gr_sm_error_state *sm_error_state); diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 37ac8748..a082cd92 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -5608,7 +5608,7 @@ int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, 
u32 tpc, u32 sm, "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); gr_gk20a_elpg_protected_call(g, - g->ops.gr.record_sm_error_state(g, gpc, tpc, fault_ch)); + g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); if (g->ops.gr.pre_process_sm_exception) { ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index 331c3af9..261c3054 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c @@ -1278,7 +1278,7 @@ void gr_gm20b_get_access_map(struct gk20a *g, *num_entries = ARRAY_SIZE(wl_addr_gm20b); } -int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, +int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, struct channel_gk20a *fault_ch) { int sm_id; @@ -1306,7 +1306,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, nvgpu_mutex_release(&g->dbg_sessions_lock); - return 0; + return sm_id; } int gm20b_gr_update_sm_error_state(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h index ff32d8ff..5c82fd65 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h @@ -116,7 +116,7 @@ void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state); void gr_gm20b_get_access_map(struct gk20a *g, u32 **whitelist, int *num_entries); int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, - u32 tpc, struct channel_gk20a *fault_ch); + u32 tpc, u32 sm, struct channel_gk20a *fault_ch); int gm20b_gr_update_sm_error_state(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, struct nvgpu_gr_sm_error_state *sm_error_state); diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 9bd48fdc..f57be9dd 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -3263,24 +3263,22 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, 
return err; } -int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, +int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, struct channel_gk20a *fault_ch) { int sm_id; struct gr_gk20a *gr = &g->gr; - u32 offset, sm, sm_per_tpc; - u32 gpc_tpc_offset; + u32 offset, sm_per_tpc, tpc_id; + u32 gpc_offset, gpc_tpc_offset; nvgpu_mutex_acquire(&g->dbg_sessions_lock); sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); - gpc_tpc_offset = gk20a_gr_gpc_offset(g, gpc) + - gk20a_gr_tpc_offset(g, tpc); + gpc_offset = gk20a_gr_gpc_offset(g, gpc); + gpc_tpc_offset = gpc_offset + gk20a_gr_tpc_offset(g, tpc); - sm_id = gr_gpc0_tpc0_sm_cfg_tpc_id_v(gk20a_readl(g, - gr_gpc0_tpc0_sm_cfg_r() + gpc_tpc_offset)); - - sm = sm_id % sm_per_tpc; + tpc_id = gk20a_readl(g, gr_gpc0_gpm_pd_sm_id_r(tpc) + gpc_offset); + sm_id = tpc_id * sm_per_tpc + sm; offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm); @@ -3301,7 +3299,7 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, nvgpu_mutex_release(&g->dbg_sessions_lock); - return 0; + return sm_id; } void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index b4a7e411..f6f05a3b 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -171,7 +171,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g, struct nvgpu_gr_sm_error_state *sm_error_state); int gv11b_gr_set_sm_debug_mode(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable); -int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, +int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, struct channel_gk20a *fault_ch); void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g); bool gv11b_gr_sm_debugger_attached(struct gk20a *g); -- cgit v1.2.2