From 3fbb44d7576238d42635e2ca6501a17cdc7306f7 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Thu, 16 Nov 2017 13:46:11 -0800
Subject: gpu: nvgpu: gv11b: channel/tsg recovery reorged

Context TSG teardown procedure:
1. Disable scheduling for the engine's runlist via PFIFO_SCHED_DISABLE.
   This enables SW to determine whether a context has hung later in the
   process: otherwise, ongoing work on the runlist may keep ENG_STATUS
   from reaching a steady state.
2. Disable all channels in the TSG being torn down or submit a new
   runlist that does not contain the TSG. This is to prevent the TSG
   from being rescheduled once scheduling is re-enabled in step 6.
3. a) Initiate a preempt of the TSG by writing NV_PFIFO_PREEMPT with
      the TSG's ID and the TYPE set to TSG, if the TSG's ID is known;
      else do 3b.
   b) Initiate a preempt of the engine by writing the bit associated
      with its runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to
      begin the preempt process prior to doing the slow register reads
      needed to determine whether the context has hit any interrupts
      or is hung. Do not poll NV_PFIFO_RUNLIST_PREEMPT for the preempt
      to complete.
4. Check for preempt done.
5. If a reset is needed as determined by step 4:
   a. Halt the memory interface for the engine (as per the relevant
      engine procedure).
   b. Reset the engine via PMC_ENABLE.
   c. Take the engine out of reset and reinit the engine (as per the
      relevant engine procedure).
6. Re-enable scheduling for the engine's runlist via PFIFO_SCHED_ENABLE.

Bug 200277163

Change-Id: I1e945a2c6b9845f365d6952109f6803309aa2270
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/1599841
Reviewed-by: svc-mobile-coverity
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 111 +++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 0238ae6c..ae2b6cfc 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -660,7 +660,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 			}
 		}
 	}
-	gk20a_dbg_info("runlists_mask = %08x", runlists_mask);
+	nvgpu_log(g, gpu_dbg_info, "runlists_mask = 0x%08x", runlists_mask);
 	return runlists_mask;
 }
 
@@ -873,9 +873,14 @@ static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-		if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
+		if (runlists_mask &
+			fifo_runlist_preempt_runlist_m(runlist_id)) {
+			/* during recovery reset engs served by this runlist */
+			g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
+				g->fifo.runlist_info[runlist_id].eng_bitmask;
 			nvgpu_mutex_release(&g->fifo.
 				runlist_info[runlist_id].mutex);
+		}
 	}
 
 	return ret;
@@ -955,11 +960,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
 
-	gk20a_dbg_info("active engine ids bitmask =0x%x", act_eng_bitmask);
-	gk20a_dbg_info("hw id =%d", id);
-	gk20a_dbg_info("id_type =%d", id_type);
-	gk20a_dbg_info("rc_type =%d", rc_type);
-	gk20a_dbg_info("mmu_fault =0x%p", mmfault);
+	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
+			"act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
+			id, id_type, rc_type, act_eng_bitmask, mmfault);
 
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 					id_type, rc_type, mmfault);
@@ -986,25 +989,29 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
 
-	if (rc_type == RC_TYPE_MMU_FAULT)
+	/* Get tsg/ch */
+	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
-
-	/* get the channel/TSG */
-	if (rc_type == RC_TYPE_MMU_FAULT && mmfault && mmfault->refch) {
 		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		if (gk20a_is_channel_marked_as_tsg(refch))
-			tsg = &g->fifo.tsg[refch->tsgid];
 		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
 				mmfault->faulted_pbdma,
 				mmfault->faulted_engine);
-	} else {
-		if (id_type == ID_TYPE_TSG)
-			tsg = &g->fifo.tsg[id];
-		else if (id_type == ID_TYPE_CHANNEL)
+	}
+
+	if (id_type == ID_TYPE_TSG) {
+		tsg = &g->fifo.tsg[id];
+	} else if (id_type == ID_TYPE_CHANNEL) {
+		if (refch == NULL)
 			refch = gk20a_channel_get(&g->fifo.channel[id]);
 	}
 
+	/* Disable tsg/ch */
+	if (tsg)
+		gk20a_disable_tsg(tsg);
+	else if (refch)
+		g->ops.fifo.disable_channel(refch);
+	/* Preempt tsg/ch */
 	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
 		g->ops.fifo.preempt_ch_tsg(g, id, id_type,
 						PREEMPT_TIMEOUT_NORC);
@@ -1012,35 +1019,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		gv11b_fifo_preempt_runlists(g, runlists_mask);
 	}
 
-	if (tsg) {
-		if (!g->fifo.deferred_reset_pending) {
-			if (rc_type == RC_TYPE_MMU_FAULT) {
-				gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-				verbose = gk20a_fifo_error_tsg(g, tsg);
-			}
-		}
-		gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
-		if (refch)
-			gk20a_channel_put(refch);
-	} else if (refch) {
-		if (!g->fifo.deferred_reset_pending) {
-			if (rc_type == RC_TYPE_MMU_FAULT) {
-				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
-				verbose = gk20a_fifo_error_ch(g, refch);
-			}
-		}
-		gk20a_channel_abort(refch, false);
-		gk20a_channel_put(refch);
-	} else {
-		nvgpu_err(g, "id unknown, abort runlist");
-		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
-						runlist_id++) {
-			if (runlists_mask & BIT(runlist_id))
-				g->ops.fifo.update_runlist(g, runlist_id,
-					FIFO_INVAL_CHANNEL_ID, false, true);
-		}
-	}
-
 	/* check if engine reset should be deferred */
 	for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
 					runlist_id++) {
@@ -1051,7 +1029,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		unsigned long __reset_eng_bitmask =
 			runlist->reset_eng_bitmask;
 
-		for_each_set_bit(engine_id, &__reset_eng_bitmask, 32) {
+		for_each_set_bit(engine_id, &__reset_eng_bitmask,
+						g->fifo.max_engines) {
 			if ((refch || tsg) &&
 				gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
@@ -1061,7 +1040,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 				/* handled during channel free */
 				g->fifo.deferred_reset_pending = true;
 
-				gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
+				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 					"sm debugger attached,"
 					" deferring channel recovery to channel free");
 			} else {
@@ -1084,12 +1063,42 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	}
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
-	if (refch)
-		gk20a_ctxsw_trace_channel_reset(g, refch);
-	else if (tsg)
+	/* tsg and refch both could be valid for mmu fault. Check tsg first */
+	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
+	else if (refch)
+		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
 
+	if (tsg) {
+		if (!g->fifo.deferred_reset_pending) {
+			if (rc_type == RC_TYPE_MMU_FAULT) {
+				gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+				verbose = gk20a_fifo_error_tsg(g, tsg);
+			}
+		}
+		gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
+		if (refch)
+			gk20a_channel_put(refch);
+	} else if (refch) {
+		if (!g->fifo.deferred_reset_pending) {
+			if (rc_type == RC_TYPE_MMU_FAULT) {
+				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
+				verbose = gk20a_fifo_error_ch(g, refch);
+			}
+		}
+		gk20a_channel_abort(refch, false);
+		gk20a_channel_put(refch);
+	} else {
+		nvgpu_err(g, "id unknown, abort runlist");
+		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
+						runlist_id++) {
+			if (runlists_mask & BIT(runlist_id))
+				g->ops.fifo.update_runlist(g, runlist_id,
+					FIFO_INVAL_CHANNEL_ID, false, true);
+		}
+	}
+
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED,
 					!RUNLIST_INFO_MUTEX_LOCKED);
 
-- 
cgit v1.2.2
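
The six steps in the commit message map onto the reorganized
gv11b_fifo_teardown_ch_tsg() flow above: disable scheduling, disable the
TSG/channel, preempt, check the preempt result, reset only the engines that
need it, then re-enable scheduling. The following stand-alone sketch models
that ordering only; every function in it is a hypothetical stub for
illustration (not an nvgpu API), and locking, error handling and the
deferred-reset path are omitted.

/*
 * Minimal model of the teardown ordering described in the commit message.
 * All functions are hypothetical stubs, not nvgpu APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static void sched_disable(void)
{
	/* Step 1: stop the runlist so ENG_STATUS can settle */
	printf("disable runlist scheduling (PFIFO_SCHED_DISABLE)\n");
}

static void disable_tsg_or_channel(void)
{
	/* Step 2: keep the TSG from being rescheduled later */
	printf("disable all channels in the TSG\n");
}

static void preempt(bool tsg_id_known)
{
	/* Step 3a or 3b, depending on whether the TSG id is known */
	printf("preempt %s\n", tsg_id_known ? "TSG by id" : "whole runlist");
}

static bool preempt_done(void)
{
	/* Step 4: poll for preempt completion; false means the ctx hung */
	return false;
}

static void reset_engine(void)
{
	/* Step 5: halt memory interface, reset via PMC_ENABLE, reinit */
	printf("reset faulted engine\n");
}

static void sched_enable(void)
{
	/* Step 6: let the runlist run again */
	printf("re-enable runlist scheduling (PFIFO_SCHED_ENABLE)\n");
}

int main(void)
{
	bool tsg_id_known = true;

	sched_disable();
	disable_tsg_or_channel();
	preempt(tsg_id_known);
	if (!preempt_done())
		reset_engine();
	sched_enable();
	return 0;
}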