From 30b571e31c7f8ee26fc58864272bc7c7e083d377 Mon Sep 17 00:00:00 2001
From: Anup Mahindre
Date: Fri, 21 Sep 2018 11:52:47 +0530
Subject: gpu: nvgpu: Add gv11b_gr_clear_sm_error_state

All chips were currently using gm20b_gr_clear_sm_error_state.
It was wrong for chips based on volta and later as the
implementation didn't consider non pes-aware vsms mapping.

Add new HAL implementation for clear_sm_error_state for
volta based and later chips to fix this.

Bug 200448172

Change-Id: I65988c8cbb35d13089ac628e8333d9a3b58e0eb1
Signed-off-by: Anup Mahindre
Reviewed-on: https://git-master.nvidia.com/r/1837188
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gv100/hal_gv100.c |  2 +-
 drivers/gpu/nvgpu/gv11b/gr_gv11b.c  | 53 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gv11b/gr_gv11b.h  |  2 ++
 drivers/gpu/nvgpu/gv11b/hal_gv11b.c |  2 +-
 4 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index 4f50b13f..2a381f55 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -383,7 +383,7 @@ static const struct gpu_ops gv100_ops = {
 		.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
 		.init_hwpm_pmm_register = gr_gv100_init_hwpm_pmm_register,
 		.record_sm_error_state = gv11b_gr_record_sm_error_state,
-		.clear_sm_error_state = gm20b_gr_clear_sm_error_state,
+		.clear_sm_error_state = gv11b_gr_clear_sm_error_state,
 		.suspend_contexts = gr_gp10b_suspend_contexts,
 		.resume_contexts = gr_gk20a_resume_contexts,
 		.get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 288bd583..bb76178e 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -5010,3 +5010,56 @@ int gr_gv11b_create_priv_addr_table(struct gk20a *g,
 	*num_registers = t;
 	return 0;
 }
+
+int gv11b_gr_clear_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id)
+{
+	u32 gpc, tpc, sm, offset;
+	u32 val;
+	struct tsg_gk20a *tsg;
+
+	int err = 0;
+
+	tsg = tsg_gk20a_from_ch(ch);
+	if (tsg == NULL) {
+		return -EINVAL;
+	}
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+
+	(void)memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
+
+	err = gr_gk20a_disable_ctxsw(g);
+	if (err != 0) {
+		nvgpu_err(g, "unable to stop gr ctxsw");
+		goto fail;
+	}
+
+	if (gk20a_is_channel_ctx_resident(ch)) {
+		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+		if (g->ops.gr.get_nonpes_aware_tpc != NULL) {
+			tpc = g->ops.gr.get_nonpes_aware_tpc(g,
+					g->gr.sm_to_cluster[sm_id].gpc_index,
+					g->gr.sm_to_cluster[sm_id].tpc_index);
+		} else {
+			tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+		}
+		sm = g->gr.sm_to_cluster[sm_id].sm_index;
+
+		offset = gk20a_gr_gpc_offset(g, gpc) +
+				gk20a_gr_tpc_offset(g, tpc) +
+				gv11b_gr_sm_offset(g, sm);
+
+		val = gk20a_readl(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
+		gk20a_writel(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
+				val);
+		gk20a_writel(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
+				0);
+	}
+
+	err = gr_gk20a_enable_ctxsw(g);
+
+fail:
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+	return err;
+}
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index 20377acf..2f765336 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -170,6 +170,8 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 	struct channel_gk20a *ch, u64 sms, bool enable);
 int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 				struct channel_gk20a *fault_ch);
+int gv11b_gr_clear_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id);
 void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g);
 bool gv11b_gr_sm_debugger_attached(struct gk20a *g);
 void gv11b_gr_suspend_single_sm(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 6a2dae77..2548cd16 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -335,7 +335,7 @@ static const struct gpu_ops gv11b_ops = {
 		.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
 		.init_hwpm_pmm_register = gr_gv100_init_hwpm_pmm_register,
 		.record_sm_error_state = gv11b_gr_record_sm_error_state,
-		.clear_sm_error_state = gm20b_gr_clear_sm_error_state,
+		.clear_sm_error_state = gv11b_gr_clear_sm_error_state,
 		.suspend_contexts = gr_gp10b_suspend_contexts,
 		.resume_contexts = gr_gk20a_resume_contexts,
 		.get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
--
cgit v1.2.2