From bdaacf544127fcfaa474ccb5466aa93f81382416 Mon Sep 17 00:00:00 2001 From: Debarshi Dutta Date: Tue, 30 Apr 2019 14:33:05 +0530 Subject: gpu: nvgpu: disable elpg before ctxsw_disable if fecs is sent stop_ctxsw method, elpg entry/exit cannot happen and may timeout. It could manifest as different error signatures depending on when stop_ctxsw fecs method gets sent with respect to pmu elpg sequence. It could come as pmu halt or abort or maybe ext error too. If ctxsw failed to disable, do not read engine info and just abort tsg. Bug 2092051 Bug 2429295 Bug 2484211 Bug 1890287 Change-Id: I5f3ba07663bcafd3f0083d44c603420b0ccf6945 Signed-off-by: Seema Khowala Reviewed-on: https://git-master.nvidia.com/r/2014914 Signed-off-by: Debarshi Dutta Reviewed-on: https://git-master.nvidia.com/r/2018156 GVS: Gerrit_Virtual_Submit Reviewed-by: Bibek Basu Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 35 +++++++++++++++++++++++++---- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 43 ++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/nvgpu/gk20a') diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 6d89940a..b96372b4 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -1943,14 +1943,42 @@ void gk20a_fifo_recover_ch(struct gk20a *g, struct channel_gk20a *ch, void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool verbose, u32 rc_type) { - u32 engines; + u32 engines = 0U; + int err; /* stop context switching to prevent engine assignments from changing until TSG is recovered */ nvgpu_mutex_acquire(&g->dbg_sessions_lock); - gr_gk20a_disable_ctxsw(g); - engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true); + /* disable tsg so that it does not get scheduled again */ + g->ops.fifo.disable_tsg(tsg); + + /* + * stop context switching to prevent engine assignments from + * changing 
until engine status is checked to make sure tsg + being recovered is not loaded on the engines + */ + err = gr_gk20a_disable_ctxsw(g); + + if (err != 0) { + /* if failed to disable ctxsw, just abort tsg */ + nvgpu_err(g, "failed to disable ctxsw"); + } else { + /* recover engines if tsg is loaded on the engines */ + engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true); + + /* + * it is ok to enable ctxsw before tsg is recovered. If engines + * is 0, no engine recovery is needed and if it is non-zero, + * gk20a_fifo_recover will call get_engines_mask_on_id again. + * By that time if tsg is not on the engine, engine need not + * be reset. + */ + err = gr_gk20a_enable_ctxsw(g); + if (err != 0) { + nvgpu_err(g, "failed to enable ctxsw"); + } + } if (engines) { gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose, @@ -1963,7 +1991,6 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg, gk20a_fifo_abort_tsg(g, tsg, false); } - gr_gk20a_enable_ctxsw(g); nvgpu_mutex_release(&g->dbg_sessions_lock); } diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index a4c1ce58..788ebf45 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -628,7 +628,14 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret) .cond.fail = GR_IS_UCODE_OP_EQUAL }, true); } -/* Stop processing (stall) context switches at FECS. */ +/** + * Stop processing (stall) context switches at FECS. + * If fecs is sent the stop_ctxsw method, elpg entry/exit cannot happen + * and may time out. It could manifest as different error signatures + * depending on when stop_ctxsw fecs method gets sent with respect + * to pmu elpg sequence. It could come as pmu halt or abort or + * maybe ext error too. 
+*/ int gr_gk20a_disable_ctxsw(struct gk20a *g) { int err = 0; @@ -638,8 +645,24 @@ int gr_gk20a_disable_ctxsw(struct gk20a *g) nvgpu_mutex_acquire(&g->ctxsw_disable_lock); g->ctxsw_disable_count++; if (g->ctxsw_disable_count == 1) { - err = gr_gk20a_ctrl_ctxsw(g, + err = nvgpu_pg_elpg_disable(g); + if (err != 0) { + nvgpu_err(g, "failed to disable elpg. not safe to " + "stop_ctxsw"); + /* stop ctxsw command is not sent */ + g->ctxsw_disable_count--; + } else { + err = gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), NULL); + if (err != 0) { + nvgpu_err(g, "failed to stop fecs ctxsw"); + /* stop ctxsw failed */ + g->ctxsw_disable_count--; + } + } + } else { + nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d", + g->ctxsw_disable_count); } nvgpu_mutex_release(&g->ctxsw_disable_lock); @@ -654,12 +677,28 @@ int gr_gk20a_enable_ctxsw(struct gk20a *g) nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); nvgpu_mutex_acquire(&g->ctxsw_disable_lock); + + if (g->ctxsw_disable_count == 0) { + goto ctxsw_already_enabled; + } g->ctxsw_disable_count--; WARN_ON(g->ctxsw_disable_count < 0); if (g->ctxsw_disable_count == 0) { err = gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), NULL); + if (err != 0) { + nvgpu_err(g, "failed to start fecs ctxsw"); + } else { + if (nvgpu_pg_elpg_enable(g) != 0) { + nvgpu_err(g, "failed to enable elpg " + "after start_ctxsw"); + } + } + } else { + nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet", + g->ctxsw_disable_count); } +ctxsw_already_enabled: nvgpu_mutex_release(&g->ctxsw_disable_lock); return err; -- cgit v1.2.2