From 613990cb391c74436384d63d12240221565011d5 Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Mon, 31 Aug 2015 14:30:35 +0530
Subject: gpu: nvgpu: implement per-channel watchdog

Implement a per-channel watchdog/timer according to the following
rules:

- start the timer when the first job is submitted on a channel, or
  when no timer is already running
- cancel the timer when a job completes
- restart the timer if any incomplete job is left in the channel's
  queue
- trigger the appropriate recovery method as part of the timeout
  handling

Handle a timeout as follows:

- get the timed-out channel and its job data
- disable activity on all engines
- check whether the fence is really still pending
- get information on the failing engine
- if no engine is failing, just abort the channel
- if an engine is failing, trigger the recovery

Also add the flag "ch_wdt_enabled" to enable/disable the channel
watchdog mechanism. The watchdog can also be disabled through the
global flag "timeouts_enabled".

Set the watchdog timeout to 5 s using the macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS.

Bug 200133289

Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
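Notes:

The start/cancel/restart rules above reduce to the delayed-work
pattern sketched below. This is a minimal, self-contained
illustration, not the driver code itself: the names (wdt_state,
wdt_arm, wdt_disarm, wdt_timeout_fn) are hypothetical, and the sync
cancel is done outside the lock so it cannot deadlock against a
handler that takes the same lock.

#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/jiffies.h>

struct wdt_state {
	struct delayed_work wq;	/* fires when the timeout elapses */
	struct mutex lock;	/* protects running and job */
	bool running;
	void *job;		/* oldest incomplete job being watched */
};

static void wdt_timeout_fn(struct work_struct *work)
{
	struct wdt_state *w = container_of(to_delayed_work(work),
					   struct wdt_state, wq);
	/* recovery decision goes here; see the second sketch */
	(void)w;
}

static void wdt_init(struct wdt_state *w)
{
	mutex_init(&w->lock);
	INIT_DELAYED_WORK(&w->wq, wdt_timeout_fn);
	w->running = false;
}

/* Arm the watchdog for a job, unless one is already running. */
static void wdt_arm(struct wdt_state *w, void *job, unsigned int ms)
{
	mutex_lock(&w->lock);
	if (!w->running) {
		w->job = job;
		w->running = true;
		schedule_delayed_work(&w->wq, msecs_to_jiffies(ms));
	}
	mutex_unlock(&w->lock);
}

/* Disarm the watchdog once the tracked job completes. */
static void wdt_disarm(struct wdt_state *w)
{
	bool was_running;

	mutex_lock(&w->lock);
	was_running = w->running;
	w->running = false;
	mutex_unlock(&w->lock);

	if (was_running)
		cancel_delayed_work_sync(&w->wq);
}

The submit path arms the watchdog per job; the completion path
disarms it and, if an incomplete job is still queued, re-arms it, so
the 5 s window always covers the oldest pending job.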
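The recovery decision made on a timeout has the shape below. This is
a condensed sketch of gk20a_channel_timeout_handler() in the diff
that follows, assuming the driver's own headers: handle_timeout is an
illustrative name, the locking and channel refcounting are trimmed,
and the real handler looks up the failing channel with
gk20a_channel_get() to pass its hw_chid and timeout_debug_dump flag
to gk20a_fifo_recover().

static void handle_timeout(struct gk20a *g, struct channel_gk20a *ch,
			   struct channel_gk20a_job *job)
{
	int id = -1;
	bool is_tsg = false;
	u32 engine_id;

	/* Quiesce the engines so the state we inspect is stable. */
	if (gk20a_fifo_disable_all_engine_activity(g, true))
		return;

	/* The job may have completed just as the timer fired. */
	if (gk20a_fence_is_expired(job->post_fence))
		goto out;

	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
	if (engine_id >= g->fifo.max_engines) {
		/* No engine is stuck: mark the context faulted and
		 * abort only this channel (or its whole TSG). */
		gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
		gk20a_channel_abort(ch);
	} else {
		/* An engine is wedged: run full recovery on it. */
		gk20a_fifo_recover(g, BIT(engine_id), id, is_tsg,
				   true, true);
	}
out:
	gk20a_fifo_enable_all_engine_activity(g);
}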
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 124 +++++++++++++++++++++++++++++++-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |  11 +++
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    |   6 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |   5 ++
 drivers/gpu/nvgpu/gk20a/gk20a.c         |   2 +
 drivers/gpu/nvgpu/gk20a/gk20a.h         |   3 +
 6 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index c18a4e5d..2dc8e9a0 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
+static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
+{
+	if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled)
+		return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS;
+	else
+		return (u32)MAX_SCHEDULE_TIMEOUT;
+}
+
 static u32 get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 	}
 }
 
+static void gk20a_channel_timeout_start(struct channel_gk20a *ch,
+		struct channel_gk20a_job *job)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.job = job;
+	ch->timeout.initialized = true;
+	schedule_delayed_work(&ch->timeout.wq,
+		msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch)));
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (!ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.initialized = false;
+	cancel_delayed_work_sync(&ch->timeout.wq);
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_handler(struct work_struct *work)
+{
+	struct channel_gk20a_job *job;
+	struct gk20a *g;
+	struct channel_gk20a *ch;
+	struct channel_gk20a *failing_ch;
+	u32 engine_id;
+	int id = -1;
+	bool is_tsg = false;
+
+	ch = container_of(to_delayed_work(work), struct channel_gk20a,
+			timeout.wq);
+	ch = gk20a_channel_get(ch);
+	if (!ch)
+		return;
+
+	g = ch->g;
+
+	/* Need global lock since multiple channels can timeout at a time */
+	mutex_lock(&g->ch_wdt_lock);
+
+	/* Get timed out job and reset the timer */
+	mutex_lock(&ch->timeout.lock);
+	job = ch->timeout.job;
+	ch->timeout.initialized = false;
+	mutex_unlock(&ch->timeout.lock);
+
+	if (gk20a_fifo_disable_all_engine_activity(g, true))
+		goto fail_unlock;
+
+	if (gk20a_fence_is_expired(job->post_fence))
+		goto fail_enable_engine_activity;
+
+	gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
+		ch->hw_chid);
+
+	/* Get failing engine data */
+	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
+
+	if (engine_id >= g->fifo.max_engines) {
+		/* If no failing engine, abort the channels */
+		if (gk20a_is_channel_marked_as_tsg(ch)) {
+			struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
+
+			gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+			gk20a_fifo_abort_tsg(g, ch->tsgid);
+		} else {
+			gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+			gk20a_channel_abort(ch);
+		}
+	} else {
+		/* If failing engine, trigger recovery */
+		failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
+		if (!failing_ch)
+			goto fail_enable_engine_activity;
+
+		if (failing_ch->hw_chid != ch->hw_chid)
+			gk20a_channel_timeout_start(ch, job);
+
+		gk20a_fifo_recover(g, BIT(engine_id),
+			failing_ch->hw_chid, is_tsg,
+			true, failing_ch->timeout_debug_dump);
+
+		gk20a_channel_put(failing_ch);
+	}
+
+fail_enable_engine_activity:
+	gk20a_fifo_enable_all_engine_activity(g);
+fail_unlock:
+	mutex_unlock(&g->ch_wdt_lock);
+	gk20a_channel_put(ch);
+}
+
 static int gk20a_channel_add_job(struct channel_gk20a *c,
 				 struct gk20a_fence *pre_fence,
 				 struct gk20a_fence *post_fence)
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 		job->pre_fence = gk20a_fence_get(pre_fence);
 		job->post_fence = gk20a_fence_get(post_fence);
 
+		gk20a_channel_timeout_start(c, job);
+
 		mutex_lock(&c->jobs_lock);
 		list_add_tail(&job->list, &c->jobs);
 		mutex_unlock(&c->jobs_lock);
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 		struct gk20a *g = c->g;
 		bool completed = gk20a_fence_is_expired(job->post_fence);
 
-		if (!completed)
+		if (!completed) {
+			gk20a_channel_timeout_start(c, job);
 			break;
+		}
+
+		gk20a_channel_timeout_stop(c);
 
 		if (c->sync)
 			c->sync->signal_timeline(c->sync);
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	mutex_init(&c->ioctl_lock);
 	mutex_init(&c->jobs_lock);
 	mutex_init(&c->submit_lock);
+	mutex_init(&c->timeout.lock);
+	INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
 	INIT_LIST_HEAD(&c->jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 2ea5b4be..70930291 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -38,6 +38,8 @@ struct gk20a_fence;
 #include "gr_gk20a.h"
 #include "fence_gk20a.h"
 
+#define NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS	5000
+
 struct gpfifo {
 	u32 entry0;
 	u32 entry1;
@@ -70,6 +72,13 @@ struct channel_gk20a_job {
 	struct list_head list;
 };
 
+struct channel_gk20a_timeout {
+	struct delayed_work wq;
+	struct mutex lock;
+	bool initialized;
+	struct channel_gk20a_job *job;
+};
+
 struct channel_gk20a_poll_events {
 	struct mutex lock;
 	bool events_enabled;
@@ -126,6 +135,8 @@ struct channel_gk20a {
 	u32 timeout_accumulated_ms;
 	u32 timeout_gpfifo_get;
 
+	struct channel_gk20a_timeout timeout;
+
 	bool cmds_pending;
 	struct {
 		/* These fences should be accessed with submit_lock held. */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 069ea82a..f736fe8c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -852,7 +852,7 @@ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 	return verbose;
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 		struct channel_gk20a *ch)
 {
 	gk20a_err(dev_from_gk20a(g),
@@ -861,7 +861,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 	return gk20a_fifo_set_ctx_mmu_error(g, ch);
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		struct tsg_gk20a *tsg)
 {
 	bool ret = true;
@@ -883,7 +883,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 	return ret;
 }
 
-static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
 {
 	struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 	struct channel_gk20a *ch;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 929b5c82..3f9fac54 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -183,5 +183,10 @@ u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
 u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
 u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 		int *__id, bool *__is_tsg);
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+		struct tsg_gk20a *tsg);
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid);
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+		struct channel_gk20a *ch);
 
 #endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index c0889571..fb8b8b14 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -667,6 +667,7 @@ static int gk20a_init_support(struct platform_device *dev)
 
 	mutex_init(&g->dbg_sessions_lock);
 	mutex_init(&g->client_lock);
+	mutex_init(&g->ch_wdt_lock);
 
 	g->remove_support = gk20a_remove_support;
 	return 0;
@@ -1449,6 +1450,7 @@ static int gk20a_probe(struct platform_device *dev)
 		CONFIG_GK20A_DEFAULT_TIMEOUT;
 	if (tegra_platform_is_silicon())
 		gk20a->timeouts_enabled = true;
+	gk20a->ch_wdt_enabled = true;
 
 	/* Set up initial power settings. For non-slicon platforms, disable *
 	 * power features and for silicon platforms, read from platform data */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index dd7a7ad4..46940744 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -475,6 +475,9 @@ struct gk20a {
 	u32 gr_idle_timeout_default;
 	u32 timeouts_enabled;
 
+	u32 ch_wdt_enabled;
+	struct mutex ch_wdt_lock;
+
 	bool slcg_enabled;
 	bool blcg_enabled;
 	bool elcg_enabled;
--
cgit v1.2.2