summaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
authorKonsta Holtta <kholtta@nvidia.com>2018-02-21 09:42:37 -0500
committermobile promotions <svcmobile_promotions@nvidia.com>2018-03-09 23:09:44 -0500
commitcb6ed949e272f8ad753bf4ab1c0d20c35f31498b (patch)
tree16d0acad2430e77f9241abe93fae61937e317373 /drivers/gpu
parent4f9368522ea18e3734798d2032b21c58dbb93a04 (diff)
gpu: nvgpu: support per-channel wdt timeouts
Replace the padding in nvgpu_channel_wdt_args with a timeout value in milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to signify the existence of this new field. When the new flag is included in the value of wdt_status, the field is used to set a per-channel timeout to override the per-GPU default. Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug dump when a timed out channel gets recovered by the watchdog. Printing the dump to the serial console can easily take several seconds. (Note that there is NVGPU_TIMEOUT_FLAG_DISABLE_DUMP about ctxsw timeout separately for NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX as well.) The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to be set. The old behaviour was that other values were silently ignored. The usage of the global default debugfs-controlled ch_wdt_timeout_ms is changed so that its value takes effect only for newly opened channels instead of in realtime. Also, a zero value no longer means that the watchdog is disabled; there is a separate flag for that after all. gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no engines were found. Correct this. Bug 1982826 Bug 1985845 Jira NVGPU-73 Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4 Signed-off-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1510898 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/nvgpu/common/linux/channel.c4
-rw-r--r--drivers/gpu/nvgpu/common/linux/ioctl_channel.c19
-rw-r--r--drivers/gpu/nvgpu/gk20a/ce2_gk20a.c2
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c34
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.h9
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.c2
6 files changed, 44 insertions, 26 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c
index 8bfa4cfc..ea294738 100644
--- a/drivers/gpu/nvgpu/common/linux/channel.c
+++ b/drivers/gpu/nvgpu/common/linux/channel.c
@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
753 */ 753 */
754 need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || 754 need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
755 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || 755 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
756 c->wdt_enabled || 756 c->timeout.enabled ||
757 (g->can_railgate && !c->deterministic) || 757 (g->can_railgate && !c->deterministic) ||
758 !skip_buffer_refcounting; 758 !skip_buffer_refcounting;
759 759
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
791 */ 791 */
792 need_deferred_cleanup = !c->deterministic || 792 need_deferred_cleanup = !c->deterministic ||
793 need_sync_framework || 793 need_sync_framework ||
794 c->wdt_enabled || 794 c->timeout.enabled ||
795 (g->can_railgate && 795 (g->can_railgate &&
796 !c->deterministic) || 796 !c->deterministic) ||
797 !skip_buffer_refcounting; 797 !skip_buffer_refcounting;
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
index 0acaa61d..01355b78 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch,
319static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, 319static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
320 struct nvgpu_channel_wdt_args *args) 320 struct nvgpu_channel_wdt_args *args)
321{ 321{
322 if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) 322 u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
323 ch->wdt_enabled = false; 323 NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
324 else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) 324
325 ch->wdt_enabled = true; 325 if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
326 ch->timeout.enabled = false;
327 else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
328 ch->timeout.enabled = true;
329 else
330 return -EINVAL;
331
332 if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT)
333 ch->timeout.limit_ms = args->timeout_ms;
334
335 ch->timeout.debug_dump = (args->wdt_status &
336 NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
326 337
327 return 0; 338 return 0;
328} 339}
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 18878991..44a10659 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g,
443 err = -ENOMEM; 443 err = -ENOMEM;
444 goto end; 444 goto end;
445 } 445 }
446 ce_ctx->ch->wdt_enabled = false; 446 ce_ctx->ch->timeout.enabled = false;
447 447
448 /* bind the channel to the vm */ 448 /* bind the channel to the vm */
449 err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); 449 err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index f9b9c6e6..5cd7223f 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
61static struct channel_gk20a_job *channel_gk20a_joblist_peek( 61static struct channel_gk20a_job *channel_gk20a_joblist_peek(
62 struct channel_gk20a *c); 62 struct channel_gk20a *c);
63 63
64static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
65
66/* allocate GPU channel */ 64/* allocate GPU channel */
67static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) 65static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
68{ 66{
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
696 /* By default, channel is regular (non-TSG) channel */ 694 /* By default, channel is regular (non-TSG) channel */
697 ch->tsgid = NVGPU_INVALID_TSG_ID; 695 ch->tsgid = NVGPU_INVALID_TSG_ID;
698 696
699 /* reset timeout counter and update timestamp */ 697 /* clear ctxsw timeout counter and update timestamp */
700 ch->timeout_accumulated_ms = 0; 698 ch->timeout_accumulated_ms = 0;
701 ch->timeout_gpfifo_get = 0; 699 ch->timeout_gpfifo_get = 0;
702 /* set gr host default timeout */ 700 /* set gr host default timeout */
703 ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); 701 ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
704 ch->timeout_debug_dump = true; 702 ch->timeout_debug_dump = true;
705 ch->has_timedout = false; 703 ch->has_timedout = false;
706 ch->wdt_enabled = true; 704
705 /* init kernel watchdog timeout */
706 ch->timeout.enabled = true;
707 ch->timeout.limit_ms = g->ch_wdt_timeout_ms;
708 ch->timeout.debug_dump = true;
709
707 ch->obj_class = 0; 710 ch->obj_class = 0;
708 ch->subctx_id = 0; 711 ch->subctx_id = 0;
709 ch->runqueue_sel = 0; 712 ch->runqueue_sel = 0;
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
1166 } 1169 }
1167 } 1170 }
1168 1171
1169 if (!c->g->timeouts_enabled || !c->wdt_enabled) 1172 if (!c->g->timeouts_enabled || !c->timeout.enabled)
1170 acquire_timeout = 0; 1173 acquire_timeout = 0;
1171 else 1174 else
1172 acquire_timeout = gk20a_get_channel_watchdog_timeout(c); 1175 acquire_timeout = c->timeout.limit_ms;
1173 1176
1174 err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, 1177 err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
1175 c->gpfifo.entry_num, 1178 c->gpfifo.entry_num,
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
1265 ch->timeout_accumulated_ms > ch->timeout_ms_max; 1268 ch->timeout_accumulated_ms > ch->timeout_ms_max;
1266} 1269}
1267 1270
1268static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
1269{
1270 return ch->g->ch_wdt_timeout_ms;
1271}
1272
1273u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) 1271u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
1274{ 1272{
1275 update_gp_get(c->g, c); 1273 update_gp_get(c->g, c);
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
1282 ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); 1280 ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch);
1283 ch->timeout.running = true; 1281 ch->timeout.running = true;
1284 nvgpu_timeout_init(ch->g, &ch->timeout.timer, 1282 nvgpu_timeout_init(ch->g, &ch->timeout.timer,
1285 gk20a_get_channel_watchdog_timeout(ch), 1283 ch->timeout.limit_ms,
1286 NVGPU_TIMER_CPU_TIMER); 1284 NVGPU_TIMER_CPU_TIMER);
1287} 1285}
1288 1286
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
1303 */ 1301 */
1304static void gk20a_channel_timeout_start(struct channel_gk20a *ch) 1302static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
1305{ 1303{
1306 if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch)) 1304 if (!ch->g->timeouts_enabled)
1307 return; 1305 return;
1308 1306
1309 if (!ch->wdt_enabled) 1307 if (!ch->timeout.enabled)
1310 return; 1308 return;
1311 1309
1312 nvgpu_raw_spinlock_acquire(&ch->timeout.lock); 1310 nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1425 nvgpu_err(g, "Job on channel %d timed out", 1423 nvgpu_err(g, "Job on channel %d timed out",
1426 ch->chid); 1424 ch->chid);
1427 1425
1428 gk20a_debug_dump(g); 1426 /* force reset calls gk20a_debug_dump but not this */
1429 gk20a_gr_debug_dump(g); 1427 if (ch->timeout.debug_dump)
1428 gk20a_gr_debug_dump(g);
1430 1429
1431 g->ops.fifo.force_reset_ch(ch, 1430 g->ops.fifo.force_reset_ch(ch,
1432 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true); 1431 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
1432 ch->timeout.debug_dump);
1433} 1433}
1434 1434
1435/** 1435/**
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index edb645b5..947b8913 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -96,11 +96,17 @@ struct channel_gk20a_joblist {
96}; 96};
97 97
98struct channel_gk20a_timeout { 98struct channel_gk20a_timeout {
99 /* lock protects the running timer state */
99 struct nvgpu_raw_spinlock lock; 100 struct nvgpu_raw_spinlock lock;
100 struct nvgpu_timeout timer; 101 struct nvgpu_timeout timer;
101 bool running; 102 bool running;
102 u32 gp_get; 103 u32 gp_get;
103 u64 pb_get; 104 u64 pb_get;
105
106 /* lock not needed */
107 u32 limit_ms;
108 bool enabled;
109 bool debug_dump;
104}; 110};
105 111
106/* 112/*
@@ -167,7 +173,6 @@ struct channel_gk20a {
167 struct nvgpu_semaphore_int *hw_sema; 173 struct nvgpu_semaphore_int *hw_sema;
168 174
169 int chid; 175 int chid;
170 bool wdt_enabled;
171 nvgpu_atomic_t bound; 176 nvgpu_atomic_t bound;
172 bool vpr; 177 bool vpr;
173 bool deterministic; 178 bool deterministic;
@@ -203,7 +208,9 @@ struct channel_gk20a {
203 u32 timeout_accumulated_ms; 208 u32 timeout_accumulated_ms;
204 u32 timeout_gpfifo_get; 209 u32 timeout_gpfifo_get;
205 210
211 /* kernel watchdog to kill stuck jobs */
206 struct channel_gk20a_timeout timeout; 212 struct channel_gk20a_timeout timeout;
213
207 /* for job cleanup handling in the background worker */ 214 /* for job cleanup handling in the background worker */
208 struct nvgpu_list_node worker_item; 215 struct nvgpu_list_node worker_item;
209 216
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 258006f9..96317520 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
1808 else { 1808 else {
1809 struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; 1809 struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
1810 1810
1811 if (gk20a_fifo_error_tsg(g, tsg)) 1811 if (gk20a_fifo_error_tsg(g, tsg) && verbose)
1812 gk20a_debug_dump(g); 1812 gk20a_debug_dump(g);
1813 1813
1814 gk20a_fifo_abort_tsg(g, tsgid, false); 1814 gk20a_fifo_abort_tsg(g, tsgid, false);