From 6509bb49da19ba9b19e3df64e473b01d54fd310d Mon Sep 17 00:00:00 2001
From: Debarshi Dutta
Date: Tue, 30 Apr 2019 15:11:31 +0530
Subject: gpu: nvgpu: protect recovery with engines_reset_mutex

Rename gr_reset_mutex to engines_reset_mutex and acquire it before
initiating recovery. Recovery running in parallel with an engine reset
is not recommended.

On hitting an engine reset, h/w drops ctxsw_status to INVALID in the
fifo_engine_status register. Also, while the engine is held in reset,
h/w passes busy/idle straight through. The fifo_engine_status registers
are correct in that there is no context switch outstanding, as the
CTXSW is aborted when reset is asserted.

Use deferred_reset_mutex to protect the deferred_reset_pending variable.

If deferred_reset_pending is true, acquire engines_reset_mutex and call
gk20a_fifo_deferred_reset. gk20a_fifo_deferred_reset also checks the
value of deferred_reset_pending before initiating the reset process.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta
(cherry-picked from cb91bf1e13740023903282d1c2271d9154e940ba in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/fifo/channel.c | 19 ++++-----
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 68 ++++++++++++++++++++++++++-------
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  2 +-
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    | 47 ++++++++++++-----------
 4 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index d30b8ded..4bea032a 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -308,6 +308,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 	struct dbg_session_data *session_data, *tmp_s;
 	struct dbg_session_channel_data *ch_data, *tmp;
 	int err;
+	bool deferred_reset_pending;
 
 	nvgpu_log_fn(g, " ");
 
@@ -381,17 +382,17 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	/* if engine reset was deferred, perform it now */
 	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
-	if (g->fifo.deferred_reset_pending) {
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (deferred_reset_pending) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
-				" deferred, running now");
-		/* if lock is already taken, a reset is taking place
-		so no need to repeat */
-		if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-			gk20a_fifo_deferred_reset(g, ch);
-			nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-		}
+			" deferred, running now");
+		nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+		gk20a_fifo_deferred_reset(g, ch);
+		nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 	}
-	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 
 	if (!gk20a_channel_as_bound(ch)) {
 		goto unbind;

diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index b96372b4..5aca7d62 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -910,9 +910,9 @@ int gk20a_init_fifo_setup_sw_common(struct gk20a *g)
 		return err;
 	}
 
-	err = nvgpu_mutex_init(&f->gr_reset_mutex);
+	err = nvgpu_mutex_init(&f->engines_reset_mutex);
 	if (err) {
-		nvgpu_err(g, "failed to init gr_reset_mutex");
+		nvgpu_err(g, "failed to init engines_reset_mutex");
 		return err;
 	}
 
@@ -1581,14 +1581,22 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 {
 	unsigned long engine_id, engines = 0U;
 	struct tsg_gk20a *tsg;
+	bool deferred_reset_pending;
+	struct fifo_gk20a *f = &g->fifo;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-	gr_gk20a_disable_ctxsw(g);
 
-	if (!g->fifo.deferred_reset_pending) {
-		goto clean_up;
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (!deferred_reset_pending) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return 0;
 	}
 
+	gr_gk20a_disable_ctxsw(g);
+
 	tsg = tsg_gk20a_from_ch(ch);
 	if (tsg != NULL) {
 		engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
@@ -1610,8 +1618,10 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 	g->fifo.deferred_fault_engines = 0;
 	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 clean_up:
 	gr_gk20a_enable_ctxsw(g);
@@ -1632,9 +1642,10 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	bool verbose = true;
 	u32 grfifo_ctl;
 
-	nvgpu_log_fn(g, " ");
+	bool deferred_reset_pending = false;
+	struct fifo_gk20a *f = &g->fifo;
 
-	g->fifo.deferred_reset_pending = false;
+	nvgpu_log_fn(g, " ");
 
 	/* Disable power management */
 	if (g->support_pmu) {
@@ -1661,6 +1672,9 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 		gk20a_debug_dump(g);
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 	/* go through all faulted engines */
 	for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) {
@@ -1761,17 +1775,17 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 				g->fifo.deferred_fault_engines |=
 						BIT(engine_id);
 
 				/* handled during channel free */
+				nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 				g->fifo.deferred_reset_pending = true;
+				nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+				deferred_reset_pending = true;
+
 				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "sm debugger attached,"
 					" deferring channel recovery to channel free");
 			} else {
-				/* if lock is already taken, a reset is taking place
-				so no need to repeat */
-				if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-					gk20a_fifo_reset_engine(g, engine_id);
-					nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-				}
+				gk20a_fifo_reset_engine(g, engine_id);
 			}
 		}
 
@@ -1784,7 +1798,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	 * Disable the channel/TSG from hw and increment syncpoints.
 	 */
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (!fake_fault) {
@@ -1847,6 +1861,9 @@ static bool gk20a_fifo_handle_mmu_fault(
 
 	nvgpu_log_fn(g, " ");
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -1859,6 +1876,10 @@ static bool gk20a_fifo_handle_mmu_fault(
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	return verbose;
 }
 
@@ -1953,6 +1974,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 	/* disable tsg so that it does not get scheduled again */
 	g->ops.fifo.disable_tsg(tsg);
 
+	/*
+	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+	 * fifo_engine_status register. Also while the engine is held in reset
+	 * h/w passes busy/idle straight through. fifo_engine_status registers
+	 * are correct in that there is no context switch outstanding
+	 * as the CTXSW is aborted when reset is asserted.
+	 */
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	/*
 	 * stop context switching to prevent engine assignments from
 	 * changing until engine status is checked to make sure tsg
@@ -1980,6 +2011,9 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 		}
 	}
 
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	if (engines) {
 		gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
 				rc_type);
@@ -2030,6 +2064,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false;
 	u32 rlid;
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -2094,6 +2131,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 0c9d9101..26365cae 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -184,7 +184,7 @@ struct fifo_gk20a {
 	/* zero-kref'd channels here */
 	struct nvgpu_list_node free_chs;
 	struct nvgpu_mutex free_chs_mutex;
-	struct nvgpu_mutex gr_reset_mutex;
+	struct nvgpu_mutex engines_reset_mutex;
 
 	struct tsg_gk20a *tsg;
 	struct nvgpu_mutex tsg_inuse_mutex;
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index b3c59f84..3c2de4f2 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -1024,6 +1024,11 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	u32 num_runlists = 0;
 	unsigned long runlist_served_pbdmas;
+	bool deferred_reset_pending = false;
+
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&f->runlist_info[rlid].
@@ -1094,8 +1099,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 
-	g->fifo.deferred_reset_pending = false;
-
 	/* Disable power management */
 	if (g->support_pmu) {
 		if (nvgpu_cg_pg_disable(g) != 0) {
@@ -1143,6 +1146,10 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 	/* check if engine reset should be deferred */
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
@@ -1159,28 +1166,21 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			    gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
-				g->fifo.deferred_fault_engines |=
+				g->fifo.deferred_fault_engines |=
 					BIT(engine_id);
 
-				/* handled during channel free */
-				g->fifo.deferred_reset_pending = true;
-				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-					"sm debugger attached,"
-					" deferring channel recovery to channel free");
+				/* handled during channel free */
+				nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+				g->fifo.deferred_reset_pending = true;
+				nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+				deferred_reset_pending = true;
+
+				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+					"sm debugger attached,"
+					" deferring channel recovery to channel free");
 			} else {
-				/*
-				 * if lock is already taken, a reset is
-				 * taking place so no need to repeat
-				 */
-				if (nvgpu_mutex_tryacquire(
-					&g->fifo.gr_reset_mutex)) {
-
-					gk20a_fifo_reset_engine(g,
-						engine_id);
-
-					nvgpu_mutex_release(
-						&g->fifo.gr_reset_mutex);
-				}
+				gk20a_fifo_reset_engine(g, engine_id);
 			}
 		}
 	}
@@ -1191,7 +1191,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
 #endif
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (rc_type == RC_TYPE_MMU_FAULT) {
@@ -1228,6 +1228,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 					runlist_lock);
 		}
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)
-- 
cgit v1.2.2
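
Note (not part of the patch): the locking pattern the commit message describes can be summarized as "snapshot deferred_reset_pending under deferred_reset_mutex, then take engines_reset_mutex before running the deferred reset, and re-check the flag inside the reset path". Below is a minimal standalone sketch of that pattern using POSIX mutexes; the struct, field, and function names mirror the nvgpu ones for readability but are illustrative stand-ins, not the driver's actual types or API.

```c
/* Minimal model of the deferred-reset locking pattern described above.
 * POSIX mutexes stand in for nvgpu_mutex; the fifo state and the reset
 * body are illustrative stubs, not nvgpu code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fifo_state {
	pthread_mutex_t deferred_reset_mutex; /* guards deferred_reset_pending */
	pthread_mutex_t engines_reset_mutex;  /* serializes engine reset vs. recovery */
	bool deferred_reset_pending;
};

/* Stand-in for the deferred reset work (gk20a_fifo_deferred_reset in the patch). */
static void do_deferred_reset(struct fifo_state *f)
{
	/* Re-check the flag under deferred_reset_mutex, as the patch does,
	 * in case another path already performed the reset. */
	pthread_mutex_lock(&f->deferred_reset_mutex);
	bool pending = f->deferred_reset_pending;
	pthread_mutex_unlock(&f->deferred_reset_mutex);
	if (!pending)
		return;

	printf("resetting deferred engines\n");

	pthread_mutex_lock(&f->deferred_reset_mutex);
	f->deferred_reset_pending = false;
	pthread_mutex_unlock(&f->deferred_reset_mutex);
}

/* Channel-free path: snapshot the flag under deferred_reset_mutex, then
 * unconditionally acquire engines_reset_mutex (no tryacquire) before
 * running the deferred reset. */
static void channel_free(struct fifo_state *f)
{
	pthread_mutex_lock(&f->deferred_reset_mutex);
	bool pending = f->deferred_reset_pending;
	pthread_mutex_unlock(&f->deferred_reset_mutex);

	if (pending) {
		pthread_mutex_lock(&f->engines_reset_mutex);
		do_deferred_reset(f);
		pthread_mutex_unlock(&f->engines_reset_mutex);
	}
}

int main(void)
{
	struct fifo_state f = {
		.deferred_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.engines_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.deferred_reset_pending = true,
	};

	channel_free(&f); /* performs the reset once */
	channel_free(&f); /* flag already cleared: nothing to do */
	return 0;
}
```

The design point the patch makes is visible here: because the channel-free path blocks on engines_reset_mutex instead of using a tryacquire, a deferred reset can no longer be silently skipped while recovery holds the lock; it simply waits its turn.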