From cb6ed949e272f8ad753bf4ab1c0d20c35f31498b Mon Sep 17 00:00:00 2001 From: Konsta Holtta Date: Wed, 21 Feb 2018 16:42:37 +0200 Subject: gpu: nvgpu: support per-channel wdt timeouts Replace the padding in nvgpu_channel_wdt_args with a timeout value in milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to signify the existence of this new field. When the new flag is included in the value of wdt_status, the field is used to set a per-channel timeout to override the per-GPU default. Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug dump when a timed-out channel gets recovered by the watchdog. Printing the dump to a serial console easily takes several seconds. (Note that there is also a separate NVGPU_TIMEOUT_FLAG_DISABLE_DUMP for the ctxsw timeout, used with NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.) The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to be set. The old behaviour was that other values were silently ignored. The usage of the global default debugfs-controlled ch_wdt_timeout_ms is changed so that its value takes effect only for newly opened channels instead of in real time. Also, a zero value no longer means that the watchdog is disabled; there is a separate flag for that after all. gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no engines were found. Correct this. 
Bug 1982826 Bug 1985845 Jira NVGPU-73 Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4 Signed-off-by: Konsta Holtta Reviewed-on: https://git-master.nvidia.com/r/1510898 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/linux/channel.c | 4 +-- drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 19 +++++++++++--- drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 2 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 34 +++++++++++++------------- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 9 ++++++- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 2 +- 6 files changed, 44 insertions(+), 26 deletions(-) (limited to 'drivers/gpu/nvgpu') diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c index 8bfa4cfc..ea294738 100644 --- a/drivers/gpu/nvgpu/common/linux/channel.c +++ b/drivers/gpu/nvgpu/common/linux/channel.c @@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, */ need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || - c->wdt_enabled || + c->timeout.enabled || (g->can_railgate && !c->deterministic) || !skip_buffer_refcounting; @@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, */ need_deferred_cleanup = !c->deterministic || need_sync_framework || - c->wdt_enabled || + c->timeout.enabled || (g->can_railgate && !c->deterministic) || !skip_buffer_refcounting; diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c index 0acaa61d..01355b78 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c @@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch, static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, struct nvgpu_channel_wdt_args *args) { - if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) - ch->wdt_enabled = false; - else if 
(args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) - ch->wdt_enabled = true; + u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT | + NVGPU_IOCTL_CHANNEL_ENABLE_WDT); + + if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) + ch->timeout.enabled = false; + else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) + ch->timeout.enabled = true; + else + return -EINVAL; + + if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT) + ch->timeout.limit_ms = args->timeout_ms; + + ch->timeout.debug_dump = (args->wdt_status & + NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0; return 0; } diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index 18878991..44a10659 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g, err = -ENOMEM; goto end; } - ce_ctx->ch->wdt_enabled = false; + ce_ctx->ch->timeout.enabled = false; /* bind the channel to the vm */ err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index f9b9c6e6..5cd7223f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c, static struct channel_gk20a_job *channel_gk20a_joblist_peek( struct channel_gk20a *c); -static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch); - /* allocate GPU channel */ static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) { @@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g, /* By default, channel is regular (non-TSG) channel */ ch->tsgid = NVGPU_INVALID_TSG_ID; - /* reset timeout counter and update timestamp */ + /* clear ctxsw timeout counter and update timestamp */ ch->timeout_accumulated_ms = 0; ch->timeout_gpfifo_get = 0; /* set gr host default timeout */ 
ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); ch->timeout_debug_dump = true; ch->has_timedout = false; - ch->wdt_enabled = true; + + /* init kernel watchdog timeout */ + ch->timeout.enabled = true; + ch->timeout.limit_ms = g->ch_wdt_timeout_ms; + ch->timeout.debug_dump = true; + ch->obj_class = 0; ch->subctx_id = 0; ch->runqueue_sel = 0; @@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, } } - if (!c->g->timeouts_enabled || !c->wdt_enabled) + if (!c->g->timeouts_enabled || !c->timeout.enabled) acquire_timeout = 0; else - acquire_timeout = gk20a_get_channel_watchdog_timeout(c); + acquire_timeout = c->timeout.limit_ms; err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, c->gpfifo.entry_num, @@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, ch->timeout_accumulated_ms > ch->timeout_ms_max; } -static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) -{ - return ch->g->ch_wdt_timeout_ms; -} - u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) { update_gp_get(c->g, c); @@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); ch->timeout.running = true; nvgpu_timeout_init(ch->g, &ch->timeout.timer, - gk20a_get_channel_watchdog_timeout(ch), + ch->timeout.limit_ms, NVGPU_TIMER_CPU_TIMER); } @@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) */ static void gk20a_channel_timeout_start(struct channel_gk20a *ch) { - if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch)) + if (!ch->g->timeouts_enabled) return; - if (!ch->wdt_enabled) + if (!ch->timeout.enabled) return; nvgpu_raw_spinlock_acquire(&ch->timeout.lock); @@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch) nvgpu_err(g, "Job on channel %d timed out", ch->chid); - gk20a_debug_dump(g); - gk20a_gr_debug_dump(g); + /* force reset calls 
gk20a_debug_dump but not this */ + if (ch->timeout.debug_dump) + gk20a_gr_debug_dump(g); g->ops.fifo.force_reset_ch(ch, - NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true); + NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, + ch->timeout.debug_dump); } /** diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index edb645b5..947b8913 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -96,11 +96,17 @@ struct channel_gk20a_joblist { }; struct channel_gk20a_timeout { + /* lock protects the running timer state */ struct nvgpu_raw_spinlock lock; struct nvgpu_timeout timer; bool running; u32 gp_get; u64 pb_get; + + /* lock not needed */ + u32 limit_ms; + bool enabled; + bool debug_dump; }; /* @@ -167,7 +173,6 @@ struct channel_gk20a { struct nvgpu_semaphore_int *hw_sema; int chid; - bool wdt_enabled; nvgpu_atomic_t bound; bool vpr; bool deterministic; @@ -203,7 +208,9 @@ struct channel_gk20a { u32 timeout_accumulated_ms; u32 timeout_gpfifo_get; + /* kernel watchdog to kill stuck jobs */ struct channel_gk20a_timeout timeout; + /* for job cleanup handling in the background worker */ struct nvgpu_list_node worker_item; diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 258006f9..96317520 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) else { struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; - if (gk20a_fifo_error_tsg(g, tsg)) + if (gk20a_fifo_error_tsg(g, tsg) && verbose) gk20a_debug_dump(g); gk20a_fifo_abort_tsg(g, tsgid, false); -- cgit v1.2.2