From 5286fd525731d19dfa07d5e6e49e8d0eef233531 Mon Sep 17 00:00:00 2001 From: Thomas Fleury Date: Wed, 17 Aug 2016 17:26:30 -0700 Subject: gpu: nvgpu: fix ctxsw timeout handling for TSGs While collecting failing engine data, id type (is_tsg) was not set for ctxsw and save engine states. This could result in some ctxsw timeout interrupts to be ignored (id reported with wrong is_tsg). For TSGs, check if we made some progress on any of the channels before kicking fifo recovery. Bug 200228310 Jira EVLR-597 Change-Id: I231549ae68317919532de0f87effb78ee9c119c6 Signed-off-by: Thomas Fleury Reviewed-on: http://git-master/r/1204035 (cherry picked from commit 7221d256fd7e9b418f7789b3d81eede8faa16f0b) Reviewed-on: http://git-master/r/1204037 Reviewed-by: Richard Zhao GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 5 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 156 +++++++++++++++++++++++++------- 3 files changed, 127 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 41fced99..d4cf6915 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -1539,16 +1539,19 @@ static inline u32 gp_free_count(struct channel_gk20a *c) } bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, - u32 timeout_delta_ms) + u32 timeout_delta_ms, bool *progress) { u32 gpfifo_get = update_gp_get(ch->g, ch); + /* Count consequent timeout isr */ if (gpfifo_get == ch->timeout_gpfifo_get) { /* we didn't advance since previous channel timeout check */ ch->timeout_accumulated_ms += timeout_delta_ms; + *progress = false; } else { /* first timeout isr encountered */ ch->timeout_accumulated_ms = timeout_delta_ms; + *progress = true; } ch->timeout_gpfifo_get = gpfifo_get; diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 971175f2..6469603b 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -218,7 +218,7 @@ int gk20a_init_channel_support(struct gk20a *, u32 chid); void gk20a_channel_close(struct channel_gk20a *ch); bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, - u32 timeout_delta_ms); + u32 timeout_delta_ms, bool *progress); void gk20a_disable_channel(struct channel_gk20a *ch); void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt); void gk20a_channel_abort_clean_up(struct channel_gk20a *ch); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index bd31656f..c18c7c94 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -1814,17 +1814,24 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, if (ctx_status == fifo_engine_status_ctx_status_ctxsw_load_v()) { id = fifo_engine_status_next_id_v(status); - is_tsg = fifo_pbdma_status_id_type_v(status) - != fifo_pbdma_status_id_type_chid_v(); + is_tsg = fifo_engine_status_next_id_type_v(status) != + fifo_engine_status_next_id_type_chid_v(); } else if (ctx_status == fifo_engine_status_ctx_status_ctxsw_switch_v()) { mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); - if (mailbox2 & FECS_METHOD_WFI_RESTORE) + if (mailbox2 & FECS_METHOD_WFI_RESTORE) { id = fifo_engine_status_next_id_v(status); - else + is_tsg = fifo_engine_status_next_id_type_v(status) != + fifo_engine_status_next_id_type_chid_v(); + } else { id = fifo_engine_status_id_v(status); + is_tsg = fifo_engine_status_id_type_v(status) != + fifo_engine_status_id_type_chid_v(); + } } else { id = fifo_engine_status_id_v(status); + is_tsg = fifo_engine_status_id_type_v(status) != + fifo_engine_status_id_type_chid_v(); } break; } @@ -1835,6 +1842,97 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, return active_engine_id; } +static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch, + bool *verbose, u32 *ms) +{ + bool recover = false; + bool progress = false; + + if (gk20a_channel_get(ch)) { + recover = gk20a_channel_update_and_check_timeout(ch, + GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000, + &progress); + *verbose = ch->timeout_debug_dump; + *ms = ch->timeout_accumulated_ms; + if (recover) + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + + gk20a_channel_put(ch); + } + return recover; +} + +static bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg, + bool *verbose, u32 *ms) +{ + struct channel_gk20a *ch; + bool recover = false; + bool progress = false; + + *verbose = false; + *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; + + mutex_lock(&tsg->ch_list_lock); + + /* check if there was some progress on any of the TSG channels. + * fifo recovery is needed if at least one channel reached the + * maximum timeout without progress (update in gpfifo pointers). + */ + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + recover = gk20a_channel_update_and_check_timeout(ch, + *ms, &progress); + if (progress || recover) + break; + gk20a_channel_put(ch); + } + } + + /* if at least one channel in the TSG made some progress, reset + * accumulated timeout for all channels in the TSG. In particular, + * this resets timeout for channels that already completed their work + */ + if (progress) { + gk20a_dbg_info("progress on tsg=%d ch=%d", + tsg->tsgid, ch->hw_chid); + gk20a_channel_put(ch); + *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + ch->timeout_accumulated_ms = *ms; + gk20a_channel_put(ch); + } + } + } + + /* if one channel is presumed dead (no progress for too long), then + * fifo recovery is needed. we can't really figure out which channel + * caused the problem, so set timeout error notifier for all channels. + */ + if (recover) { + gk20a_dbg_info("timeout on tsg=%d ch=%d", + tsg->tsgid, ch->hw_chid); + *ms = ch->timeout_accumulated_ms; + gk20a_channel_put(ch); + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + *verbose |= ch->timeout_debug_dump; + gk20a_channel_put(ch); + } + } + } + + /* if we could not detect progress on any of the channel, but none + * of them has reached the timeout, there is nothing more to do: + * timeout_accumulated_ms has been updated for all of them. + */ + mutex_unlock(&tsg->ch_list_lock); + return recover; +} + static bool gk20a_fifo_handle_sched_error(struct gk20a *g) { u32 sched_error; @@ -1859,50 +1957,40 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) if (fifo_intr_sched_error_code_f(sched_error) == fifo_intr_sched_error_code_ctxsw_timeout_v()) { struct fifo_gk20a *f = &g->fifo; - struct channel_gk20a *ch = &f->channel[id]; + u32 ms = 0; + bool verbose = false; if (is_tsg) { - gk20a_channel_timeout_restart_all_channels(g); - gk20a_fifo_recover(g, BIT(engine_id), id, true, - true, true); - ret = true; - goto err; + ret = gk20a_fifo_check_tsg_ctxsw_timeout( + &f->tsg[id], &verbose, &ms); + } else { + ret = gk20a_fifo_check_ch_ctxsw_timeout( + &f->channel[id], &verbose, &ms); } - if (!gk20a_channel_get(ch)) - goto err; - - if (gk20a_channel_update_and_check_timeout(ch, - GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + if (ret) { gk20a_err(dev_from_gk20a(g), - "fifo sched ctxsw timeout error:" - "engine = %u, ch = %d", engine_id, id); - gk20a_gr_debug_dump(g->dev); + "fifo sched ctxsw timeout error: " + "engine=%u, %s=%d, ms=%u", + engine_id, is_tsg ? "tsg" : "ch", id, ms); /* * Cancel all channels' timeout since SCHED error might * trigger multiple watchdogs at a time */ gk20a_channel_timeout_restart_all_channels(g); - gk20a_fifo_recover(g, BIT(engine_id), id, false, - true, ch->timeout_debug_dump); - ret = true; + gk20a_fifo_recover(g, BIT(engine_id), id, + is_tsg, true, verbose); } else { gk20a_dbg_info( - "fifo is waiting for ctx switch for %d ms," - "ch = %d\n", - ch->timeout_accumulated_ms, - id); - ret = false; + "fifo is waiting for ctx switch for %d ms, " + "%s=%d", ms, is_tsg ? "tsg" : "ch", id); } - gk20a_channel_put(ch); - return ret; + } else { + gk20a_err(dev_from_gk20a(g), + "fifo sched error : 0x%08x, engine=%u, %s=%d", + sched_error, engine_id, is_tsg ? "tsg" : "ch", id); } - gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d", - sched_error, engine_id, is_tsg ? "tsg" : "ch", id); - err: return ret; } @@ -1913,7 +2001,7 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) struct device *dev = dev_from_gk20a(g); u32 handled = 0; - gk20a_dbg_fn(""); + gk20a_dbg_fn("fifo_intr=0x%08x", fifo_intr); if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { /* pio mode is unused. this shouldn't happen, ever. */ -- cgit v1.2.2