-rw-r--r--  drivers/gpu/nvgpu/common/fb/fb_gv11b.c |  79
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c   |  15
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h   |  10
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h        |   6
-rw-r--r--  drivers/gpu/nvgpu/gv100/mc_gv100.c     |  16
-rw-r--r--  drivers/gpu/nvgpu/gv100/mc_gv100.h     |   3
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fifo_gv11b.c   | 469
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fifo_gv11b.h   |  10
-rw-r--r--  drivers/gpu/nvgpu/gv11b/mc_gv11b.c     |  16
-rw-r--r--  drivers/gpu/nvgpu/gv11b/mc_gv11b.h     |   3

10 files changed, 397 insertions(+), 230 deletions(-)
diff --git a/drivers/gpu/nvgpu/common/fb/fb_gv11b.c b/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
index 69a71575..26dabd72 100644
--- a/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
@@ -792,10 +792,11 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
 static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
 {
-	unsigned int id_type;
+	unsigned int id_type = ID_TYPE_UNKNOWN;
 	u32 num_lce, act_eng_bitmask = 0;
 	int err = 0;
-	u32 id = ((u32)~0);
+	u32 id = FIFO_INVAL_TSG_ID;
+	unsigned int rc_type = RC_TYPE_NO_RC;
 
 	if (!mmfault->valid)
 		return;
@@ -810,18 +811,23 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* CE page faults are not reported as replayable */
 		nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
 		err = gv11b_fb_fix_page_fault(g, mmfault);
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch,
-				mmfault->faulted_pbdma, mmfault->faulted_engine);
+		if (mmfault->refch &&
+			(u32)mmfault->refch->tsgid != FIFO_INVAL_TSG_ID) {
+			gv11b_fifo_reset_pbdma_and_eng_faulted(g,
+					&g->fifo.tsg[mmfault->refch->tsgid],
+					mmfault->faulted_pbdma,
+					mmfault->faulted_engine);
+		}
 		if (!err) {
 			nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
 			*invalidate_replay_val = 0;
-			/* refch in mmfault is assigned at the time of copying
-			 * fault info from snap reg or bar2 fault buf
-			 */
-			gk20a_channel_put(mmfault->refch);
+			if (mmfault->refch) {
+				gk20a_channel_put(mmfault->refch);
+				mmfault->refch = NULL;
+			}
 			return;
 		}
-		/* Do recovery. Channel recovery needs refch */
+		/* Do recovery */
 		nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
 	}
 
@@ -833,16 +839,9 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		 * instance block, the fault cannot be isolated to a
 		 * single context so we need to reset the entire runlist
 		 */
-		id_type = ID_TYPE_UNKNOWN;
+		rc_type = RC_TYPE_MMU_FAULT;
 
 	} else if (mmfault->refch) {
-		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
-			id = mmfault->refch->tsgid;
-			id_type = ID_TYPE_TSG;
-		} else {
-			id = mmfault->chid;
-			id_type = ID_TYPE_CHANNEL;
-		}
 		if (mmfault->refch->mmu_nack_handled) {
 			/* We have already recovered for the same
 			 * context, skip doing another recovery.
@@ -863,19 +862,40 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 			 */
 			gk20a_channel_put(mmfault->refch);
 			return;
+		} else {
+			/* Indicate recovery is handled if mmu fault is
+			 * a result of mmu nack.
+			 */
+			mmfault->refch->mmu_nack_handled = true;
+		}
+
+		rc_type = RC_TYPE_MMU_FAULT;
+		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
+			id = mmfault->refch->tsgid;
+			if (id != FIFO_INVAL_TSG_ID)
+				id_type = ID_TYPE_TSG;
+		} else {
+			nvgpu_err(g, "bare channels not supported");
 		}
-	} else {
-		id_type = ID_TYPE_UNKNOWN;
 	}
-	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
+
+	/* engine is faulted */
+	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) {
 		act_eng_bitmask = BIT(mmfault->faulted_engine);
+		rc_type = RC_TYPE_MMU_FAULT;
+	}
 
-	/* Indicate recovery is handled if mmu fault is a result of
-	 * mmu nack.
-	 */
-	mmfault->refch->mmu_nack_handled = true;
-	g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
-			id, id_type, RC_TYPE_MMU_FAULT, mmfault);
+	/* refch in mmfault is assigned at the time of copying
+	 * fault info from snap reg or bar2 fault buf
+	 */
+	if (mmfault->refch) {
+		gk20a_channel_put(mmfault->refch);
+		mmfault->refch = NULL;
+	}
+
+	if (rc_type != RC_TYPE_NO_RC)
+		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
+				id, id_type, rc_type, mmfault);
 	} else {
 		if (mmfault->fault_type == gmmu_fault_type_pte_v()) {
 			nvgpu_log(g, gpu_dbg_intr, "invalid pte! try to fix");
@@ -894,7 +914,10 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* refch in mmfault is assigned at the time of copying
 		 * fault info from snap reg or bar2 fault buf
 		 */
-		gk20a_channel_put(mmfault->refch);
+		if (mmfault->refch) {
+			gk20a_channel_put(mmfault->refch);
+			mmfault->refch = NULL;
+		}
 	}
 }
 
@@ -985,8 +1008,10 @@ void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 		next_fault_addr = mmfault->fault_addr;
 		if (prev_fault_addr == next_fault_addr) {
 			nvgpu_log(g, gpu_dbg_intr, "pte already scanned");
-			if (mmfault->refch)
+			if (mmfault->refch) {
 				gk20a_channel_put(mmfault->refch);
+				mmfault->refch = NULL;
+			}
 			continue;
 		}
 	}
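
Note: the fb_gv11b.c changes above repeatedly apply one pattern — every gk20a_channel_put() of mmfault->refch is guarded by a NULL check and the pointer is cleared right after the put, so no later exit path can release the same reference twice. A minimal standalone C sketch of that pattern, with hypothetical types and names rather than the driver's real API:

/* Hypothetical stand-ins for the driver's channel refcounting. */
#include <stdio.h>

struct channel {
	int refcount;
};

static void channel_put(struct channel *ch)
{
	ch->refcount--;
	printf("put: refcount=%d\n", ch->refcount);
}

struct fault_info {
	struct channel *refch; /* set when fault info is copied from hw */
};

/* Release the fault's channel ref exactly once, from any exit path. */
static void fault_release_refch(struct fault_info *f)
{
	if (f->refch) {
		channel_put(f->refch);
		f->refch = NULL; /* guards against a second put */
	}
}

int main(void)
{
	struct channel ch = { .refcount = 1 };
	struct fault_info fault = { .refch = &ch };

	fault_release_refch(&fault); /* drops the ref */
	fault_release_refch(&fault); /* safely a no-op now */
	return 0;
}
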
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index cd54baf1..57cb0019 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -55,9 +55,7 @@
 #define FECS_METHOD_WFI_RESTORE 0x80000
 #define FECS_MAILBOX_0_ACK_RESTORE 0x4
 
-static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
-					    u32 chid, bool add,
-					    bool wait_for_finish);
+
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
 static const char *const pbdma_intr_fault_type_desc[] = {
@@ -2708,7 +2706,7 @@ void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg)
 }
 
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type, unsigned int timeout_rc_type)
+		unsigned int id_type)
 {
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
@@ -2781,8 +2779,8 @@ int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 	id_type = is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
 
 	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
-			PREEMPT_TIMEOUT_RC);
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
+
 	return ret;
 }
 
@@ -3279,7 +3277,7 @@ void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
 			fifo_eng_runlist_length_f(count));
 }
 
-static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    u32 chid, bool add,
 					    bool wait_for_finish)
 {
@@ -3452,8 +3450,7 @@ static int __locked_fifo_reschedule_preempt_next(struct channel_gk20a *ch,
 		gk20a_readl(g, fifo_preempt_r()));
 #endif
 	if (wait_preempt) {
-		g->ops.fifo.is_preempt_pending(
-			g, preempt_id, preempt_type, PREEMPT_TIMEOUT_RC);
+		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type);
 	}
 #ifdef TRACEPOINTS_ENABLED
 	trace_gk20a_reschedule_preempted_next(ch->chid);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index bccd15f6..77030c94 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -50,9 +50,6 @@ enum {
 #define ID_TYPE_TSG 1
 #define ID_TYPE_UNKNOWN ((u32)~0)
 
-#define PREEMPT_TIMEOUT_RC 1
-#define PREEMPT_TIMEOUT_NORC 0
-
 #define RC_YES 1
 #define RC_NO 0
 
@@ -257,6 +254,9 @@ int nvgpu_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next,
 int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 chid,
 			      bool add, bool wait_for_finish);
 
+int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+				     u32 chid, bool add,
+				     bool wait_for_finish);
 int gk20a_fifo_suspend(struct gk20a *g);
 
 bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
@@ -390,8 +390,8 @@ void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a);
 
 u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g);
 
-int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id, unsigned int id_type,
-	unsigned int timeout_rc_type);
+int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
+	unsigned int id_type);
 int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg);
 void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
 		unsigned int id_type);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index d6e0342b..17b0a60b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -685,9 +685,9 @@ struct gpu_ops {
 				struct ch_state *ch_state);
 		u32 (*intr_0_error_mask)(struct gk20a *g);
 		int (*is_preempt_pending)(struct gk20a *g, u32 id,
-			unsigned int id_type, unsigned int timeout_rc_type);
+			unsigned int id_type);
 		int (*preempt_ch_tsg)(struct gk20a *g, u32 id,
-			unsigned int id_type, unsigned int timeout_rc_type);
+			unsigned int id_type);
 		void (*init_pbdma_intr_descs)(struct fifo_gk20a *f);
 		int (*reset_enable_hw)(struct gk20a *g);
 		int (*setup_userd)(struct channel_gk20a *c);
@@ -1132,7 +1132,7 @@ struct gpu_ops {
 		bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_stall_and_eng_intr_pending)(struct gk20a *g,
-					u32 act_eng_id);
+					u32 act_eng_id, u32 *eng_intr_pending);
 		u32 (*intr_stall)(struct gk20a *g);
 		void (*intr_stall_pause)(struct gk20a *g);
 		void (*intr_stall_resume)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.c b/drivers/gpu/nvgpu/gv100/mc_gv100.c
index 46af100a..7d38a3fb 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.c
@@ -66,15 +66,14 @@ bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false);
 }
 
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	if ((mc_intr_0 & eng_intr_mask) != 0U) {
-		return true;
-	}
+	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
 
 	stall_intr = mc_intr_pfifo_pending_f() |
 			mc_intr_hub_pending_f() |
@@ -82,9 +81,10 @@ bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
 			mc_intr_pbus_pending_f() |
 			mc_intr_ltc_pending_f() |
 			mc_intr_nvlink_pending_f();
-	if ((mc_intr_0 & stall_intr) != 0U) {
-		return true;
-	}
 
-	return false;
+	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
+		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
+		mc_intr_0 & stall_intr, *eng_intr_pending);
+
+	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
 }
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.h b/drivers/gpu/nvgpu/gv100/mc_gv100.h
index 4aff4a36..e9069258 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.h
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.h
@@ -26,5 +26,6 @@ struct gk20a;
 
 void mc_gv100_intr_enable(struct gk20a *g);
 bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending);
 #endif
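
Note: mc_gv100.c above and mc_gv11b.c below move is_stall_and_eng_intr_pending() to the same contract — the pending engine-interrupt bits are reported through the new eng_intr_pending out-parameter, while the return value says whether any stall or engine interrupt is pending at all, so the caller can tell the two cases apart. A standalone C sketch of that contract, using hypothetical mask values in place of the real mc_intr_* fields:

/* Hypothetical interrupt masks; the real ones come from hw headers. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ENG_INTR_MASK   0x000000f0u /* assumed per-engine bits */
#define STALL_INTR_MASK 0x0000000fu /* assumed stall sources */

static bool is_stall_and_eng_intr_pending(uint32_t mc_intr_0,
					  uint32_t *eng_intr_pending)
{
	/* report engine bits separately from the combined verdict */
	*eng_intr_pending = mc_intr_0 & ENG_INTR_MASK;
	return (mc_intr_0 & (ENG_INTR_MASK | STALL_INTR_MASK)) != 0u;
}

int main(void)
{
	uint32_t eng_pending;

	/* stall-only interrupt: pending, but no engine bits set */
	if (is_stall_and_eng_intr_pending(0x1u, &eng_pending))
		printf("pending, eng bits = 0x%08x\n",
			(unsigned int)eng_pending);

	/* engine interrupt: caller can stop polling and reset the engine */
	if (is_stall_and_eng_intr_pending(0x10u, &eng_pending))
		printf("pending, eng bits = 0x%08x\n",
			(unsigned int)eng_pending);
	return 0;
}
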
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 4edaaac1..f30f2ae1 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -387,17 +387,24 @@ u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g)
 
 u32 gv11b_fifo_get_preempt_timeout(struct gk20a *g)
 {
-	return gk20a_get_gr_idle_timeout(g);
+	/* if timeouts are enabled, using 3000ms timeout
+	 * for polling pdma/eng/runlist might kick in
+	 * timeout handler in the cases where preempt
+	 * is stuck. Use 1000ms timeout for polling when
+	 * timeouts are enabled */
+	return nvgpu_is_timeouts_enabled(g) ? PREEMPT_TIMEOUT_1000_MS :
+			g->gr_idle_timeout_default;
 }
 
 static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
-		u32 pbdma_id, unsigned int timeout_rc_type)
+		u32 pbdma_id)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 pbdma_stat;
 	u32 chan_stat;
 	int ret = -EBUSY;
+	unsigned int loop_count = 0;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -406,6 +413,14 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 	nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
 	/* Verify that ch/tsg is no longer on the pbdma */
 	do {
+		if (!nvgpu_platform_is_silicon(g)) {
+			if (loop_count >= MAX_PRE_SI_RETRIES) {
+				nvgpu_err(g, "preempt pbdma retries: %u",
+					loop_count);
+				break;
+			}
+			loop_count++;
+		}
 		/*
 		 * If the PBDMA has a stalling interrupt and receives a NACK,
 		 * the PBDMA won't save out until the STALLING interrupt is
@@ -458,21 +473,24 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired_msg(&timeout,
-			"preempt timeout pbdma"));
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	if (ret)
+		nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
+				"tsgid: %u", pbdma_id, pbdma_stat, id);
 	return ret;
 }
 
 static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
-		u32 act_eng_id, u32 *reset_eng_bitmask,
-		unsigned int timeout_rc_type)
+		u32 act_eng_id, u32 *reset_eng_bitmask)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 eng_stat;
 	u32 ctx_stat;
 	int ret = -EBUSY;
-	bool stall_intr = false;
+	unsigned int loop_count = 0;
+	u32 eng_intr_pending;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -482,20 +500,56 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			act_eng_id);
 	/* Check if ch/tsg has saved off the engine or if ctxsw is hung */
 	do {
+		if (!nvgpu_platform_is_silicon(g)) {
+			if (loop_count >= MAX_PRE_SI_RETRIES) {
+				nvgpu_err(g, "preempt eng retries: %u",
+					loop_count);
+				break;
+			}
+			loop_count++;
+		}
 		eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
 		ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
 
-		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) {
-			stall_intr = true;
+		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
+					&eng_intr_pending)) {
+		/* From h/w team
+		 * Engine save can be blocked by eng stalling interrupts.
+		 * FIFO interrupts shouldn’t block an engine save from
+		 * finishing, but could block FIFO from reporting preempt done.
+		 * No immediate reason to reset the engine if FIFO interrupt is
+		 * pending.
+		 * The hub, priv_ring, and ltc interrupts could block context
+		 * switch (or memory), but doesn’t necessarily have to.
+		 * For Hub interrupts they just report access counters and page
+		 * faults. Neither of these necessarily block context switch
+		 * or preemption, but they could.
+		 * For example a page fault for graphics would prevent graphics
+		 * from saving out. An access counter interrupt is a
+		 * notification and has no effect.
+		 * SW should handle page faults though for preempt to complete.
+		 * PRI interrupt (due to a failed PRI transaction) will result
+		 * in ctxsw failure reported to HOST.
+		 * LTC interrupts are generally ECC related and if so,
+		 * certainly don’t block preemption/ctxsw but they could.
+		 * Bus interrupts shouldn’t have anything to do with preemption
+		 * state as they are part of the Host EXT pipe, though they may
+		 * exhibit a symptom that indicates that GPU is in a bad state.
+		 * To be completely fair, when an engine is preempting SW
+		 * really should just handle other interrupts as they come in.
+		 * It’s generally bad to just poll and wait on a preempt
+		 * to complete since there are many things in the GPU which may
+		 * cause a system to hang/stop responding.
+		 */
 			nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
 				"stall intr set, "
-				"preemption will not finish");
+				"preemption might not finish");
 		}
 		if (ctx_stat ==
 			 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
-			if (stall_intr) {
-				/* if stall intr stop polling */
+			if (eng_intr_pending) {
+				/* if eng intr, stop polling */
 				*reset_eng_bitmask |= BIT(act_eng_id);
 				ret = 0;
 				break;
@@ -507,8 +561,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			 fifo_engine_status_ctx_status_ctxsw_save_v()) {
 
 			if (id == fifo_engine_status_id_v(eng_stat)) {
-				if (stall_intr ||
-					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
+				if (eng_intr_pending) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -524,9 +577,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			 fifo_engine_status_ctx_status_ctxsw_load_v()) {
 
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
-
-				if (stall_intr ||
-					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
+				if (eng_intr_pending) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -546,8 +597,21 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired_msg(&timeout,
-			"preempt timeout eng"));
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	if (ret) {
+		/*
+		 * The reasons a preempt can fail are:
+		 * 1.Some other stalling interrupt is asserted preventing
+		 *   channel or context save.
+		 * 2.The memory system hangs.
+		 * 3.The engine hangs during CTXSW.
+		 */
+		nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
+			act_eng_id, ctx_stat, id);
+		*reset_eng_bitmask |= BIT(act_eng_id);
+	}
+
 	return ret;
 }
 
@@ -594,29 +658,19 @@ static void gv11b_reset_pbdma_faulted_tsg(struct tsg_gk20a *tsg)
 }
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct channel_gk20a *refch,
+			struct tsg_gk20a *tsg,
 			u32 faulted_pbdma, u32 faulted_engine)
 {
-	struct tsg_gk20a *tsg;
+	if (!tsg)
+		return;
 
 	nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
 			faulted_pbdma, faulted_engine);
 
-	if (!refch)
-		return;
-
-	if (gk20a_is_channel_marked_as_tsg(refch)) {
-		tsg = &g->fifo.tsg[refch->tsgid];
-		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-			gv11b_reset_pbdma_faulted_tsg(tsg);
-		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-			gv11b_reset_eng_faulted_tsg(tsg);
-	} else {
-		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-			gv11b_reset_pbdma_faulted_ch(g, refch->chid);
-		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-			gv11b_reset_eng_faulted_ch(g, refch->chid);
-	}
+	if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+		gv11b_reset_pbdma_faulted_tsg(tsg);
+	if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+		gv11b_reset_eng_faulted_tsg(tsg);
 }
 
 static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
@@ -626,7 +680,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 	u32 runlists_mask = 0;
 	struct fifo_gk20a *f = &g->fifo;
 	struct fifo_runlist_info_gk20a *runlist;
-	u32 pbdma_bitmask = 0;
+	u32 rlid, pbdma_bitmask = 0;
 
 	if (id_type != ID_TYPE_UNKNOWN) {
 		if (id_type == ID_TYPE_TSG)
@@ -641,31 +695,31 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 		if (mmfault->faulted_pbdma != FIFO_INVAL_PBDMA_ID)
 			pbdma_bitmask = BIT(mmfault->faulted_pbdma);
 
-		for (id = 0; id < f->max_runlists; id++) {
+		for (rlid = 0; rlid < f->max_runlists; rlid++) {
 
-			runlist = &f->runlist_info[id];
+			runlist = &f->runlist_info[rlid];
 
 			if (runlist->eng_bitmask & act_eng_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(id);
+					fifo_sched_disable_runlist_m(rlid);
 
 			if (runlist->pbdma_bitmask & pbdma_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(id);
+					fifo_sched_disable_runlist_m(rlid);
 		}
 	}
 
 	if (id_type == ID_TYPE_UNKNOWN) {
-		for (id = 0; id < f->max_runlists; id++) {
+		for (rlid = 0; rlid < f->max_runlists; rlid++) {
 			if (act_eng_bitmask) {
 				/* eng ids are known */
-				runlist = &f->runlist_info[id];
+				runlist = &f->runlist_info[rlid];
 				if (runlist->eng_bitmask & act_eng_bitmask)
 					runlists_mask |=
-						fifo_sched_disable_runlist_m(id);
+						fifo_sched_disable_runlist_m(rlid);
 			} else {
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(id);
+					fifo_sched_disable_runlist_m(rlid);
 			}
 		}
 	}
@@ -697,10 +751,20 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
 	int ret = -EBUSY;
+	unsigned int loop_count = 0;
 
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
 			NVGPU_TIMER_CPU_TIMER);
 	do {
+		if (!nvgpu_platform_is_silicon(g)) {
+			if (loop_count >= MAX_PRE_SI_RETRIES) {
+				nvgpu_err(g, "preempt runlist retries: %u",
+					loop_count);
+				break;
+			}
+			loop_count++;
+		}
+
 		if (!((gk20a_readl(g, fifo_runlist_preempt_r())) &
 				runlists_mask)) {
 			ret = 0;
@@ -710,13 +774,16 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired_msg(&timeout,
-			"runlist preempt timeout"));
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	if (ret)
+		nvgpu_err(g, "preempt runlist timeout, runlists_mask:0x%08x",
+				runlists_mask);
 	return ret;
 }
 
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type, unsigned int timeout_rc_type)
+		unsigned int id_type)
 {
 	struct fifo_gk20a *f = &g->fifo;
 	unsigned long runlist_served_pbdmas;
@@ -724,7 +791,6 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
-	int func_ret;
 	int ret = 0;
 	u32 tsgid;
 
@@ -741,30 +807,14 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
 
-	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
-
-		func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
-				timeout_rc_type);
-		if (func_ret != 0) {
-			nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
-			ret |= func_ret;
-		}
-	}
+	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
+		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
 
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
 
-	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
-
-		func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
-			&f->runlist_info[runlist_id].reset_eng_bitmask,
-			timeout_rc_type);
-
-		if (func_ret != 0) {
-			nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
-			ret |= func_ret;
-		}
-	}
-
+	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
+		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+			&f->runlist_info[runlist_id].reset_eng_bitmask);
 	return ret;
 }
 
@@ -848,6 +898,9 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 
 	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 
+	/* WAR for Bug 2065990 */
+	gk20a_fifo_disable_tsg_sched(g, &f->tsg[tsgid]);
+
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	ret = __locked_fifo_preempt(g, tsgid, true);
@@ -855,6 +908,9 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	if (!mutex_ret)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
+	/* WAR for Bug 2065990 */
+	gk20a_fifo_enable_tsg_sched(g, &f->tsg[tsgid]);
+
 	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 
 	if (ret)
@@ -863,44 +919,36 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	return ret;
 }
 
-static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
+static void gv11b_fifo_locked_preempt_runlists(struct gk20a *g, u32 runlists_mask)
 {
 	int ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
-	u32 runlist_id;
-
-	nvgpu_log_fn(g, " ");
+	u32 rlid;
 
-	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-		if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
-			nvgpu_mutex_acquire(&g->fifo.
-				runlist_info[runlist_id].runlist_lock);
-	}
+	/* runlist_lock are locked by teardown and sched are disabled too */
+	nvgpu_log_fn(g, "preempt runlists_mask:0x%08x", runlists_mask);
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	ret = __locked_fifo_preempt_runlists(g, runlists_mask);
 
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-
-	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-		if (runlists_mask &
-			fifo_runlist_preempt_runlist_m(runlist_id)) {
-			/* during recovery reset engs served by this runlist */
-			g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
-				g->fifo.runlist_info[runlist_id].eng_bitmask;
-			nvgpu_mutex_release(&g->fifo.
-				runlist_info[runlist_id].runlist_lock);
+	if (ret) {
+		/* if preempt timed out, reset engs served by runlists */
+		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
+			if (runlists_mask &
+			    fifo_runlist_preempt_runlist_m(rlid))
+				g->fifo.runlist_info[rlid].reset_eng_bitmask =
+				g->fifo.runlist_info[rlid].eng_bitmask;
 		}
 	}
 
-	return ret;
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 }
 
 static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-		unsigned int id_type, unsigned int timeout_rc_type)
+		unsigned int id_type)
 {
 	int ret;
 	struct fifo_gk20a *f = &g->fifo;
@@ -914,52 +962,97 @@ static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
 	gk20a_fifo_issue_preempt(g, id, true);
 
 	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
-			timeout_rc_type);
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
 
-	if (ret && (timeout_rc_type == PREEMPT_TIMEOUT_RC))
-		gk20a_fifo_preempt_timeout_rc(g, id, id_type);
+	/* No recovery even if preempt timed out since
+	 * this is called from recovery path
+	 */
 
 	return ret;
 }
 
 
 int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-		unsigned int id_type, unsigned int timeout_rc_type)
+		unsigned int id_type)
 {
-	struct fifo_gk20a *f = &g->fifo;
 	u32 ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
-	u32 runlist_id;
 
-	if (id_type == ID_TYPE_TSG)
-		runlist_id = f->tsg[id].runlist_id;
-	else if (id_type == ID_TYPE_CHANNEL)
-		runlist_id = f->channel[id].runlist_id;
-	else
-		return -EINVAL;
+	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+	/*
+	 * This is called from teardown path only. runlist_lock
+	 * is already acquired before calling this function.
+	 */
+	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type);
 
-	if (runlist_id >= g->fifo.max_runlists) {
-		nvgpu_log_info(g, "runlist_id = %d", runlist_id);
-		return -EINVAL;
-	}
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	nvgpu_log_fn(g, "preempt id = %d, runlist_id = %d", id, runlist_id);
+	return ret;
 
-	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
+}
+
+static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
+			unsigned int rc_type,
+			u32 runlists_mask)
+{
+	struct tsg_gk20a *tsg = NULL;
+	u32 rlid, tsgid;
+	struct fifo_runlist_info_gk20a *runlist = NULL;
+	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+	u32 mutex_ret = 0;
+	bool add = false, wait_for_finish = false;
+	int err;
 
+	nvgpu_err(g, "runlist id unknown, abort active tsgs in runlists");
+
+	/* runlist_lock are locked by teardown */
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type, timeout_rc_type);
+	for (rlid = 0; rlid < g->fifo.max_runlists;
+			rlid++) {
+		if (!(runlists_mask & BIT(rlid)))
+			continue;
+		nvgpu_log(g, gpu_dbg_info, "abort runlist id %d",
+				rlid);
+		runlist = &g->fifo.runlist_info[rlid];
+
+		for_each_set_bit(tsgid, runlist->active_tsgs,
+				g->fifo.num_channels) {
+			nvgpu_log(g, gpu_dbg_info, "abort tsg id %d", tsgid);
+			tsg = &g->fifo.tsg[tsgid];
+			gk20a_disable_tsg(tsg);
 
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+			/* assume all pbdma and eng faulted are set */
+			nvgpu_log(g, gpu_dbg_info, "reset pbdma and eng faulted");
+			gv11b_reset_pbdma_faulted_tsg(tsg);
+			gv11b_reset_eng_faulted_tsg(tsg);
 
-	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+			gk20a_ctxsw_trace_tsg_reset(g, tsg);
+#endif
+			if (!g->fifo.deferred_reset_pending) {
+				if (rc_type == RC_TYPE_MMU_FAULT) {
+					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+					gk20a_fifo_error_tsg(g, tsg);
+				}
+			}
 
-	return ret;
+			/* (chid == ~0 && !add) remove all act ch from runlist*/
+			err = gk20a_fifo_update_runlist_locked(g, rlid,
+					FIFO_INVAL_CHANNEL_ID, add, wait_for_finish);
+			if (err)
+				nvgpu_err(g, "runlist id %d is not cleaned up",
+					rlid);
 
+			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
+
+			nvgpu_log(g, gpu_dbg_info, "aborted tsg id %d", tsgid);
+		}
+	}
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 }
 
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
@@ -967,10 +1060,66 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		struct mmu_fault_info *mmfault)
 {
 	struct tsg_gk20a *tsg = NULL;
-	struct channel_gk20a *refch = NULL;
-	u32 runlists_mask, runlist_id;
+	u32 runlists_mask, rlid;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
+	struct fifo_gk20a *f = &g->fifo;
+	u32 runlist_id = FIFO_INVAL_RUNLIST_ID;
+	u32 num_runlists = 0;
+
+	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
+	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
+		nvgpu_mutex_acquire(&f->runlist_info[rlid].
+			runlist_lock);
+
+	/* get runlist id and tsg */
+	if (id_type == ID_TYPE_TSG) {
+		if (id != FIFO_INVAL_TSG_ID) {
+			tsg = &g->fifo.tsg[id];
+			runlist_id = tsg->runlist_id;
+			if (runlist_id != FIFO_INVAL_RUNLIST_ID)
+				num_runlists++;
+			else
+				nvgpu_log_fn(g, "tsg runlist id is invalid");
+		} else {
+			nvgpu_log_fn(g, "id type is tsg but tsg id is inval");
+		}
+	} else {
+		/*
+		 * id type is unknown, get runlist_id if eng mask is such that
+		 * it corresponds to single runlist id. If eng mask corresponds
+		 * to multiple runlists, then abort all runlists
+		 */
+		for (rlid = 0; rlid < f->max_runlists; rlid++) {
+			if (act_eng_bitmask) {
+				/* eng ids are known */
+				runlist = &f->runlist_info[rlid];
+				if (runlist->eng_bitmask & act_eng_bitmask) {
+					runlist_id = rlid;
+					num_runlists++;
+				}
+			} else {
+				break;
+			}
+		}
+		if (num_runlists > 1 ) /* abort all runlists */
+			runlist_id = FIFO_INVAL_RUNLIST_ID;
+	}
+
+	/* if runlist_id is valid and there is only single runlist to be
+	 * aborted, release runlist lock that are not
+	 * needed for this recovery
+	 */
+	if (runlist_id != FIFO_INVAL_RUNLIST_ID && num_runlists == 1) {
+		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
+			if (rlid != runlist_id) {
+				nvgpu_log_fn(g, "release runlist_lock for "
+					"unused runlist id: %d", rlid);
+				nvgpu_mutex_release(&f->runlist_info[rlid].
+					runlist_lock);
+			}
+		}
+	}
 
 	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
 			"act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
@@ -979,6 +1128,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 			id_type, rc_type, mmfault);
 
+	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 
 	g->fifo.deferred_reset_pending = false;
@@ -1000,41 +1150,41 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
 
-	/* Get tsg/ch */
 	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
-		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, tsg,
 				mmfault->faulted_pbdma,
 				mmfault->faulted_engine);
 	}
 
-	if (id_type == ID_TYPE_TSG) {
-		tsg = &g->fifo.tsg[id];
-	} else if (id_type == ID_TYPE_CHANNEL) {
-		if (refch == NULL)
-			refch = gk20a_channel_get(&g->fifo.channel[id]);
-	}
-	/* Disable tsg/ch */
 	if (tsg)
 		gk20a_disable_tsg(tsg);
-	else if (refch)
-		g->ops.fifo.disable_channel(refch);
 
-	/* Preempt tsg/ch */
-	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
-		g->ops.fifo.preempt_ch_tsg(g, id, id_type,
-				PREEMPT_TIMEOUT_NORC);
+	/*
+	 * Even though TSG preempt timed out, the RC sequence would by design
+	 * require s/w to issue another preempt.
+	 * If recovery includes an ENGINE_RESET, to not have race conditions,
+	 * use RUNLIST_PREEMPT to kick all work off, and cancel any context
+	 * load which may be pending. This is also needed to make sure
+	 * that all PBDMAs serving the engine are not loaded when engine is
+	 * reset.
+	 */
+	if (tsg) {
+		int preempt_failed;
+
+		preempt_failed = g->ops.fifo.preempt_ch_tsg(g, id, id_type);
+		if (preempt_failed)
+			gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
 	} else {
-		gv11b_fifo_preempt_runlists(g, runlists_mask);
+		gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
 	}
 
 	/* check if engine reset should be deferred */
-	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 
-		runlist = &g->fifo.runlist_info[runlist_id];
-		if ((runlists_mask & BIT(runlist_id)) &&
+		runlist = &g->fifo.runlist_info[rlid];
+		if ((runlists_mask & BIT(rlid)) &&
 			runlist->reset_eng_bitmask) {
 
 			unsigned long __reset_eng_bitmask =
@@ -1042,7 +1192,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 			for_each_set_bit(engine_id, &__reset_eng_bitmask,
 				g->fifo.max_engines) {
-				if ((refch || tsg) &&
+				if (tsg &&
 					gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
@@ -1074,13 +1224,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	}
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
-	/* tsg and refch both could be valid for mmu fault. Check tsg first */
 	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
-	else if (refch)
-		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
-
 	if (tsg) {
 		if (g->fifo.deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
@@ -1090,26 +1236,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 		gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
 		}
-		if (refch)
-			gk20a_channel_put(refch);
-	} else if (refch) {
-		if (g->fifo.deferred_reset_pending) {
-			g->ops.fifo.disable_channel(refch);
-		} else {
-			if (rc_type == RC_TYPE_MMU_FAULT)
-				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
-
-			gk20a_channel_abort(refch, false);
-		}
-		gk20a_channel_put(refch);
 	} else {
-		nvgpu_err(g, "id unknown, abort runlist");
-		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
-				runlist_id++) {
-			if (runlists_mask & BIT(runlist_id))
-				g->ops.fifo.update_runlist(g, runlist_id,
-					FIFO_INVAL_CHANNEL_ID, false, true);
-		}
+		gv11b_fifo_locked_abort_runlist_active_tsgs(g, rc_type,
+				runlists_mask);
 	}
 
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED);
@@ -1117,6 +1246,18 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* It is safe to enable ELPG again. */
 	if (g->support_pmu && g->elpg_enabled)
 		nvgpu_pmu_enable_elpg(g);
+
+	/* release runlist_lock */
+	if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
+		nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",
+				runlist_id);
+		nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
+	} else {
+		nvgpu_log_fn(g, "release runlist_lock for all runlists");
+		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
+			nvgpu_mutex_release(&f->runlist_info[rlid].
+				runlist_lock);
+	}
 }
 
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)
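
Note: all three polling loops patched in fifo_gv11b.c (pbdma status, engine ctx status, runlist preempt) gain the same guard — on pre-silicon platforms, where wall-clock timers run far slower than real time, the loop is additionally bounded by MAX_PRE_SI_RETRIES iterations instead of relying on the timeout alone. A standalone C sketch of that loop shape, with stubbed helpers in place of the driver's timeout and register APIs:

/* Stubbed stand-ins for the driver's platform/timeout helpers. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_PRE_SI_RETRIES 200000 /* from the patch: 1G/500KHz * 100 */

static bool platform_is_silicon(void) { return false; }        /* stub */
static bool timeout_expired(int elapsed) { return elapsed > 1000; }
static bool preempt_done(int elapsed) { return elapsed > 50; } /* stub */

static int poll_preempt(void)
{
	unsigned int loop_count = 0;
	int elapsed = 0;
	int ret = -1; /* -EBUSY in the driver */

	do {
		/* pre-silicon: bound the loop by iteration count */
		if (!platform_is_silicon()) {
			if (loop_count >= MAX_PRE_SI_RETRIES) {
				printf("preempt retries: %u\n", loop_count);
				break;
			}
			loop_count++;
		}
		if (preempt_done(elapsed)) {
			ret = 0;
			break;
		}
		elapsed++;
	} while (!timeout_expired(elapsed));

	if (ret)
		printf("preempt timed out\n");
	return ret;
}

int main(void)
{
	return poll_preempt() ? 1 : 0;
}
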
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index 1ae3c93e..aee7aef2 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -50,10 +50,13 @@
 
 #define CHANNEL_INFO_VEID0 0
 
+#define MAX_PRE_SI_RETRIES 200000 /* 1G/500KHz * 100 */
+#define PREEMPT_TIMEOUT_1000_MS 1000
+
 struct gpu_ops;
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct channel_gk20a *refch,
+			struct tsg_gk20a *tsg,
 			u32 faulted_pbdma, u32 faulted_engine);
 void gv11b_mmu_fault_id_to_eng_pbdma_id_and_veid(struct gk20a *g,
 	u32 mmu_fault_id, u32 *active_engine_id, u32 *veid, u32 *pbdma_id);
@@ -78,12 +81,11 @@ void gv11b_dump_eng_status(struct gk20a *g,
 u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g);
 int gv11b_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next);
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-	unsigned int id_type, unsigned int timeout_rc_type);
+	unsigned int id_type);
 int gv11b_fifo_preempt_channel(struct gk20a *g, u32 chid);
 int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid);
 int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
-int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-		unsigned int id_type, unsigned int timeout_rc_type);
+int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id, unsigned int id_type);
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	u32 id, unsigned int id_type, unsigned int rc_type,
 	struct mmu_fault_info *mmfault);
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
index 64680fc6..bc802c2d 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
@@ -66,24 +66,24 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false);
 }
 
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	if ((mc_intr_0 & eng_intr_mask) != 0U) {
-		return true;
-	}
+	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
 
 	stall_intr = mc_intr_pfifo_pending_f() |
 			mc_intr_hub_pending_f() |
 			mc_intr_priv_ring_pending_f() |
 			mc_intr_pbus_pending_f() |
 			mc_intr_ltc_pending_f();
-	if ((mc_intr_0 & stall_intr) != 0U) {
-		return true;
-	}
 
-	return false;
+	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
+		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
+		mc_intr_0 & stall_intr, *eng_intr_pending);
+
+	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
 }
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
index eb9d0e4e..faa4d38d 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
@@ -26,5 +26,6 @@ struct gk20a;
 
 void mc_gv11b_intr_enable(struct gk20a *g);
 bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending);
 #endif