From 1407133b7e1b27a92ee8c116009541904d2ff691 Mon Sep 17 00:00:00 2001 From: Seema Khowala Date: Thu, 12 Apr 2018 16:09:43 -0700 Subject: gpu: nvgpu: gv11b: do not poll preempt done if eng intr pending -During polling eng preempt done, reset eng only if eng stall intr is pending. Also stop polling for eng preempt done if eng intr is pending. -Add max retries for pre-si platforms for poll pbdma and eng preempt done polling loops. Bug 2125776 Bug 2108544 Bug 2105322 Bug 2092051 Bug 2048824 Bug 2043838 Bug 2039587 Bug 2028993 Bug 2029245 Bug 2065990 Bug 1945121 Bug 200401707 Bug 200393631 Bug 200327596 Change-Id: I66b07be9647f141bd03801f83e3cda797e88272f Signed-off-by: Seema Khowala Reviewed-on: https://git-master.nvidia.com/r/1694137 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gk20a.h | 2 +- drivers/gpu/nvgpu/gv100/mc_gv100.c | 16 ++--- drivers/gpu/nvgpu/gv100/mc_gv100.h | 3 +- drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 113 ++++++++++++++++++++++++----------- drivers/gpu/nvgpu/gv11b/fifo_gv11b.h | 2 + drivers/gpu/nvgpu/gv11b/mc_gv11b.c | 16 ++--- drivers/gpu/nvgpu/gv11b/mc_gv11b.h | 3 +- 7 files changed, 101 insertions(+), 54 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 9061236e..25146b8b 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -1109,7 +1109,7 @@ struct gpu_ops { bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr); bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr); bool (*is_stall_and_eng_intr_pending)(struct gk20a *g, - u32 act_eng_id); + u32 act_eng_id, u32 *eng_intr_pending); u32 (*intr_stall)(struct gk20a *g); void (*intr_stall_pause)(struct gk20a *g); void (*intr_stall_resume)(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.c b/drivers/gpu/nvgpu/gv100/mc_gv100.c index 7ed9e6da..2d84a3a8 100644 --- a/drivers/gpu/nvgpu/gv100/mc_gv100.c +++ b/drivers/gpu/nvgpu/gv100/mc_gv100.c @@ -72,15 +72,14 @@ bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0) return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false); } -bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) +bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id, + u32 *eng_intr_pending) { u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0)); u32 stall_intr, eng_intr_mask; eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id); - if ((mc_intr_0 & eng_intr_mask) != 0U) { - return true; - } + *eng_intr_pending = mc_intr_0 & eng_intr_mask; stall_intr = mc_intr_pfifo_pending_f() | mc_intr_hub_pending_f() | @@ -88,9 +87,10 @@ bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) mc_intr_pbus_pending_f() | mc_intr_ltc_pending_f() | mc_intr_nvlink_pending_f(); - if ((mc_intr_0 & stall_intr) != 0U) { - return true; - } - return false; + nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, + "mc_intr_0 = 0x%08x, eng_intr = 0x%08x", + mc_intr_0 & stall_intr, *eng_intr_pending); + + return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U; } diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.h b/drivers/gpu/nvgpu/gv100/mc_gv100.h index 4aff4a36..e9069258 100644 --- a/drivers/gpu/nvgpu/gv100/mc_gv100.h +++ b/drivers/gpu/nvgpu/gv100/mc_gv100.h @@ -26,5 +26,6 @@ struct gk20a; void mc_gv100_intr_enable(struct gk20a *g); bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0); -bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id); +bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id, + u32 *eng_intr_pending); #endif diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index 7e0ce4c6..13d498a7 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c @@ -392,6 +392,7 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, u32 pbdma_stat; u32 chan_stat; int ret = -EBUSY; + unsigned int loop_count = 0; /* timeout in milli seconds */ nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), @@ -400,6 +401,14 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id); /* Verify that ch/tsg is no longer on the pbdma */ do { + if (!nvgpu_platform_is_silicon(g)) { + if (loop_count >= MAX_PRE_SI_RETRIES) { + nvgpu_err(g, "preempt pbdma retries: %u", + loop_count); + break; + } + loop_count++; + } /* * If the PBDMA has a stalling interrupt and receives a NACK, * the PBDMA won't save out until the STALLING interrupt is @@ -452,8 +461,11 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, nvgpu_usleep_range(delay, delay * 2); delay = min_t(unsigned long, delay << 1, GR_IDLE_CHECK_MAX); - } while (!nvgpu_timeout_expired_msg(&timeout, - "preempt timeout pbdma")); + } while (!nvgpu_timeout_expired(&timeout)); + + if (ret) + nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u " + "tsgid: %u", pbdma_id, pbdma_stat, id); return ret; } @@ -466,7 +478,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, u32 eng_stat; u32 ctx_stat; int ret = -EBUSY; - bool stall_intr = false; + unsigned int loop_count = 0; + u32 eng_intr_pending; /* timeout in milli seconds */ nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), @@ -476,20 +489,56 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, act_eng_id); /* Check if ch/tsg has saved off the engine or if ctxsw is hung */ do { + if (!nvgpu_platform_is_silicon(g)) { + if (loop_count >= MAX_PRE_SI_RETRIES) { + nvgpu_err(g, "preempt eng retries: %u", + loop_count); + break; + } + loop_count++; + } eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id)); ctx_stat = fifo_engine_status_ctx_status_v(eng_stat); - if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) { - stall_intr = true; + if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id, + &eng_intr_pending)) { + /* From h/w team + * Engine save can be blocked by eng stalling interrupts. + * FIFO interrupts shouldn’t block an engine save from + * finishing, but could block FIFO from reporting preempt done. + * No immediate reason to reset the engine if FIFO interrupt is + * pending. + * The hub, priv_ring, and ltc interrupts could block context + * switch (or memory), but doesn’t necessarily have to. + * For Hub interrupts they just report access counters and page + * faults. Neither of these necessarily block context switch + * or preemption, but they could. + * For example a page fault for graphics would prevent graphics + * from saving out. An access counter interrupt is a + * notification and has no effect. + * SW should handle page faults though for preempt to complete. + * PRI interrupt (due to a failed PRI transaction) will result + * in ctxsw failure reported to HOST. + * LTC interrupts are generally ECC related and if so, + * certainly don’t block preemption/ctxsw but they could. + * Bus interrupts shouldn’t have anything to do with preemption + * state as they are part of the Host EXT pipe, though they may + * exhibit a symptom that indicates that GPU is in a bad state. + * To be completely fair, when an engine is preempting SW + * really should just handle other interrupts as they come in. + * It’s generally bad to just poll and wait on a preempt + * to complete since there are many things in the GPU which may + * cause a system to hang/stop responding. + */ nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, "stall intr set, " - "preemption will not finish"); + "preemption might not finish"); } if (ctx_stat == fifo_engine_status_ctx_status_ctxsw_switch_v()) { /* Eng save hasn't started yet. Continue polling */ - if (stall_intr) { - /* if stall intr stop polling */ + if (eng_intr_pending) { + /* if eng intr, stop polling */ *reset_eng_bitmask |= BIT(act_eng_id); ret = 0; break; @@ -501,8 +550,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, fifo_engine_status_ctx_status_ctxsw_save_v()) { if (id == fifo_engine_status_id_v(eng_stat)) { - if (stall_intr || - timeout_rc_type == PREEMPT_TIMEOUT_NORC) { + if (eng_intr_pending) { /* preemption will not finish */ *reset_eng_bitmask |= BIT(act_eng_id); ret = 0; @@ -518,9 +566,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, fifo_engine_status_ctx_status_ctxsw_load_v()) { if (id == fifo_engine_status_next_id_v(eng_stat)) { - - if (stall_intr || - timeout_rc_type == PREEMPT_TIMEOUT_NORC) { + if (eng_intr_pending) { /* preemption will not finish */ *reset_eng_bitmask |= BIT(act_eng_id); ret = 0; @@ -540,8 +586,21 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, nvgpu_usleep_range(delay, delay * 2); delay = min_t(unsigned long, delay << 1, GR_IDLE_CHECK_MAX); - } while (!nvgpu_timeout_expired_msg(&timeout, - "preempt timeout eng")); + } while (!nvgpu_timeout_expired(&timeout)); + + if (ret) { + /* + * The reasons a preempt can fail are: + * 1.Some other stalling interrupt is asserted preventing + * channel or context save. + * 2.The memory system hangs. + * 3.The engine hangs during CTXSW. + */ + nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u", + act_eng_id, ctx_stat, id); + *reset_eng_bitmask |= BIT(act_eng_id); + } + return ret; } @@ -718,7 +777,6 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, u32 pbdma_id; u32 act_eng_id; u32 runlist_id; - int func_ret; int ret = 0; u32 tsgid; @@ -735,30 +793,15 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask; runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask; - for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) { - - func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id, + for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) + ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id, timeout_rc_type); - if (func_ret != 0) { - nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id); - ret |= func_ret; - } - } - f->runlist_info[runlist_id].reset_eng_bitmask = 0; - for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) { - - func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id, + for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) + ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id, &f->runlist_info[runlist_id].reset_eng_bitmask, timeout_rc_type); - - if (func_ret != 0) { - nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id); - ret |= func_ret; - } - } - return ret; } diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h index 1ae3c93e..5ff16453 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h @@ -50,6 +50,8 @@ #define CHANNEL_INFO_VEID0 0 +#define MAX_PRE_SI_RETRIES 200000 /* 1G/500KHz * 100 */ + struct gpu_ops; void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c index 31600828..dbeb0645 100644 --- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c @@ -71,24 +71,24 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0) return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false); } -bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) +bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id, + u32 *eng_intr_pending) { u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0)); u32 stall_intr, eng_intr_mask; eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id); - if ((mc_intr_0 & eng_intr_mask) != 0U) { - return true; - } + *eng_intr_pending = mc_intr_0 & eng_intr_mask; stall_intr = mc_intr_pfifo_pending_f() | mc_intr_hub_pending_f() | mc_intr_priv_ring_pending_f() | mc_intr_pbus_pending_f() | mc_intr_ltc_pending_f(); - if ((mc_intr_0 & stall_intr) != 0U) { - return true; - } - return false; + nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, + "mc_intr_0 = 0x%08x, eng_intr = 0x%08x", + mc_intr_0 & stall_intr, *eng_intr_pending); + + return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U; } diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h index eb9d0e4e..faa4d38d 100644 --- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h @@ -26,5 +26,6 @@ struct gk20a; void mc_gv11b_intr_enable(struct gk20a *g); bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0); -bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id); +bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id, + u32 *eng_intr_pending); #endif -- cgit v1.2.2