diff options
author | Konsta Holtta <kholtta@nvidia.com> | 2018-02-21 09:42:37 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2018-03-09 23:09:44 -0500 |
commit | cb6ed949e272f8ad753bf4ab1c0d20c35f31498b (patch) | |
tree | 16d0acad2430e77f9241abe93fae61937e317373 | |
parent | 4f9368522ea18e3734798d2032b21c58dbb93a04 (diff) |
gpu: nvgpu: support per-channel wdt timeouts
Replace the padding in nvgpu_channel_wdt_args with a timeout value in
milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to
signify the existence of this new field. When the new flag is included
in the value of wdt_status, the field is used to set a per-channel
timeout to override the per-GPU default.
Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug
dump when a timed-out channel gets recovered by the watchdog. Printing
the dump to the serial console easily takes several seconds. (Note that
a separate NVGPU_TIMEOUT_FLAG_DISABLE_DUMP flag already exists for the
ctxsw timeout, set via NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.)
The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either
NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to
be set. The old behaviour was that other values were silently ignored.
The usage of the global default debugfs-controlled ch_wdt_timeout_ms is
changed so that its value takes effect only for newly opened channels
instead of in real time. Also, a zero value no longer means that the
watchdog is disabled; there is a separate flag for that after all.
gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no
engines were found. Correct this.
Bug 1982826
Bug 1985845
Jira NVGPU-73
Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1510898
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/common/linux/channel.c | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 19 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 34 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 2 | ||||
-rw-r--r-- | include/uapi/linux/nvgpu.h | 10 |
7 files changed, 50 insertions, 30 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c index 8bfa4cfc..ea294738 100644 --- a/drivers/gpu/nvgpu/common/linux/channel.c +++ b/drivers/gpu/nvgpu/common/linux/channel.c | |||
@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
753 | */ | 753 | */ |
754 | need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || | 754 | need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || |
755 | (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || | 755 | (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || |
756 | c->wdt_enabled || | 756 | c->timeout.enabled || |
757 | (g->can_railgate && !c->deterministic) || | 757 | (g->can_railgate && !c->deterministic) || |
758 | !skip_buffer_refcounting; | 758 | !skip_buffer_refcounting; |
759 | 759 | ||
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
791 | */ | 791 | */ |
792 | need_deferred_cleanup = !c->deterministic || | 792 | need_deferred_cleanup = !c->deterministic || |
793 | need_sync_framework || | 793 | need_sync_framework || |
794 | c->wdt_enabled || | 794 | c->timeout.enabled || |
795 | (g->can_railgate && | 795 | (g->can_railgate && |
796 | !c->deterministic) || | 796 | !c->deterministic) || |
797 | !skip_buffer_refcounting; | 797 | !skip_buffer_refcounting; |
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c index 0acaa61d..01355b78 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c | |||
@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch, | |||
319 | static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, | 319 | static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, |
320 | struct nvgpu_channel_wdt_args *args) | 320 | struct nvgpu_channel_wdt_args *args) |
321 | { | 321 | { |
322 | if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) | 322 | u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT | |
323 | ch->wdt_enabled = false; | 323 | NVGPU_IOCTL_CHANNEL_ENABLE_WDT); |
324 | else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) | 324 | |
325 | ch->wdt_enabled = true; | 325 | if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) |
326 | ch->timeout.enabled = false; | ||
327 | else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) | ||
328 | ch->timeout.enabled = true; | ||
329 | else | ||
330 | return -EINVAL; | ||
331 | |||
332 | if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT) | ||
333 | ch->timeout.limit_ms = args->timeout_ms; | ||
334 | |||
335 | ch->timeout.debug_dump = (args->wdt_status & | ||
336 | NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0; | ||
326 | 337 | ||
327 | return 0; | 338 | return 0; |
328 | } | 339 | } |
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index 18878991..44a10659 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | |||
@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g, | |||
443 | err = -ENOMEM; | 443 | err = -ENOMEM; |
444 | goto end; | 444 | goto end; |
445 | } | 445 | } |
446 | ce_ctx->ch->wdt_enabled = false; | 446 | ce_ctx->ch->timeout.enabled = false; |
447 | 447 | ||
448 | /* bind the channel to the vm */ | 448 | /* bind the channel to the vm */ |
449 | err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); | 449 | err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index f9b9c6e6..5cd7223f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c, | |||
61 | static struct channel_gk20a_job *channel_gk20a_joblist_peek( | 61 | static struct channel_gk20a_job *channel_gk20a_joblist_peek( |
62 | struct channel_gk20a *c); | 62 | struct channel_gk20a *c); |
63 | 63 | ||
64 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch); | ||
65 | |||
66 | /* allocate GPU channel */ | 64 | /* allocate GPU channel */ |
67 | static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) | 65 | static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) |
68 | { | 66 | { |
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g, | |||
696 | /* By default, channel is regular (non-TSG) channel */ | 694 | /* By default, channel is regular (non-TSG) channel */ |
697 | ch->tsgid = NVGPU_INVALID_TSG_ID; | 695 | ch->tsgid = NVGPU_INVALID_TSG_ID; |
698 | 696 | ||
699 | /* reset timeout counter and update timestamp */ | 697 | /* clear ctxsw timeout counter and update timestamp */ |
700 | ch->timeout_accumulated_ms = 0; | 698 | ch->timeout_accumulated_ms = 0; |
701 | ch->timeout_gpfifo_get = 0; | 699 | ch->timeout_gpfifo_get = 0; |
702 | /* set gr host default timeout */ | 700 | /* set gr host default timeout */ |
703 | ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); | 701 | ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g); |
704 | ch->timeout_debug_dump = true; | 702 | ch->timeout_debug_dump = true; |
705 | ch->has_timedout = false; | 703 | ch->has_timedout = false; |
706 | ch->wdt_enabled = true; | 704 | |
705 | /* init kernel watchdog timeout */ | ||
706 | ch->timeout.enabled = true; | ||
707 | ch->timeout.limit_ms = g->ch_wdt_timeout_ms; | ||
708 | ch->timeout.debug_dump = true; | ||
709 | |||
707 | ch->obj_class = 0; | 710 | ch->obj_class = 0; |
708 | ch->subctx_id = 0; | 711 | ch->subctx_id = 0; |
709 | ch->runqueue_sel = 0; | 712 | ch->runqueue_sel = 0; |
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, | |||
1166 | } | 1169 | } |
1167 | } | 1170 | } |
1168 | 1171 | ||
1169 | if (!c->g->timeouts_enabled || !c->wdt_enabled) | 1172 | if (!c->g->timeouts_enabled || !c->timeout.enabled) |
1170 | acquire_timeout = 0; | 1173 | acquire_timeout = 0; |
1171 | else | 1174 | else |
1172 | acquire_timeout = gk20a_get_channel_watchdog_timeout(c); | 1175 | acquire_timeout = c->timeout.limit_ms; |
1173 | 1176 | ||
1174 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, | 1177 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, |
1175 | c->gpfifo.entry_num, | 1178 | c->gpfifo.entry_num, |
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | |||
1265 | ch->timeout_accumulated_ms > ch->timeout_ms_max; | 1268 | ch->timeout_accumulated_ms > ch->timeout_ms_max; |
1266 | } | 1269 | } |
1267 | 1270 | ||
1268 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) | ||
1269 | { | ||
1270 | return ch->g->ch_wdt_timeout_ms; | ||
1271 | } | ||
1272 | |||
1273 | u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) | 1271 | u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) |
1274 | { | 1272 | { |
1275 | update_gp_get(c->g, c); | 1273 | update_gp_get(c->g, c); |
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) | |||
1282 | ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); | 1280 | ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch); |
1283 | ch->timeout.running = true; | 1281 | ch->timeout.running = true; |
1284 | nvgpu_timeout_init(ch->g, &ch->timeout.timer, | 1282 | nvgpu_timeout_init(ch->g, &ch->timeout.timer, |
1285 | gk20a_get_channel_watchdog_timeout(ch), | 1283 | ch->timeout.limit_ms, |
1286 | NVGPU_TIMER_CPU_TIMER); | 1284 | NVGPU_TIMER_CPU_TIMER); |
1287 | } | 1285 | } |
1288 | 1286 | ||
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) | |||
1303 | */ | 1301 | */ |
1304 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch) | 1302 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch) |
1305 | { | 1303 | { |
1306 | if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch)) | 1304 | if (!ch->g->timeouts_enabled) |
1307 | return; | 1305 | return; |
1308 | 1306 | ||
1309 | if (!ch->wdt_enabled) | 1307 | if (!ch->timeout.enabled) |
1310 | return; | 1308 | return; |
1311 | 1309 | ||
1312 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); | 1310 | nvgpu_raw_spinlock_acquire(&ch->timeout.lock); |
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch) | |||
1425 | nvgpu_err(g, "Job on channel %d timed out", | 1423 | nvgpu_err(g, "Job on channel %d timed out", |
1426 | ch->chid); | 1424 | ch->chid); |
1427 | 1425 | ||
1428 | gk20a_debug_dump(g); | 1426 | /* force reset calls gk20a_debug_dump but not this */ |
1429 | gk20a_gr_debug_dump(g); | 1427 | if (ch->timeout.debug_dump) |
1428 | gk20a_gr_debug_dump(g); | ||
1430 | 1429 | ||
1431 | g->ops.fifo.force_reset_ch(ch, | 1430 | g->ops.fifo.force_reset_ch(ch, |
1432 | NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true); | 1431 | NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, |
1432 | ch->timeout.debug_dump); | ||
1433 | } | 1433 | } |
1434 | 1434 | ||
1435 | /** | 1435 | /** |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index edb645b5..947b8913 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -96,11 +96,17 @@ struct channel_gk20a_joblist { | |||
96 | }; | 96 | }; |
97 | 97 | ||
98 | struct channel_gk20a_timeout { | 98 | struct channel_gk20a_timeout { |
99 | /* lock protects the running timer state */ | ||
99 | struct nvgpu_raw_spinlock lock; | 100 | struct nvgpu_raw_spinlock lock; |
100 | struct nvgpu_timeout timer; | 101 | struct nvgpu_timeout timer; |
101 | bool running; | 102 | bool running; |
102 | u32 gp_get; | 103 | u32 gp_get; |
103 | u64 pb_get; | 104 | u64 pb_get; |
105 | |||
106 | /* lock not needed */ | ||
107 | u32 limit_ms; | ||
108 | bool enabled; | ||
109 | bool debug_dump; | ||
104 | }; | 110 | }; |
105 | 111 | ||
106 | /* | 112 | /* |
@@ -167,7 +173,6 @@ struct channel_gk20a { | |||
167 | struct nvgpu_semaphore_int *hw_sema; | 173 | struct nvgpu_semaphore_int *hw_sema; |
168 | 174 | ||
169 | int chid; | 175 | int chid; |
170 | bool wdt_enabled; | ||
171 | nvgpu_atomic_t bound; | 176 | nvgpu_atomic_t bound; |
172 | bool vpr; | 177 | bool vpr; |
173 | bool deterministic; | 178 | bool deterministic; |
@@ -203,7 +208,9 @@ struct channel_gk20a { | |||
203 | u32 timeout_accumulated_ms; | 208 | u32 timeout_accumulated_ms; |
204 | u32 timeout_gpfifo_get; | 209 | u32 timeout_gpfifo_get; |
205 | 210 | ||
211 | /* kernel watchdog to kill stuck jobs */ | ||
206 | struct channel_gk20a_timeout timeout; | 212 | struct channel_gk20a_timeout timeout; |
213 | |||
207 | /* for job cleanup handling in the background worker */ | 214 | /* for job cleanup handling in the background worker */ |
208 | struct nvgpu_list_node worker_item; | 215 | struct nvgpu_list_node worker_item; |
209 | 216 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 258006f9..96317520 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) | |||
1808 | else { | 1808 | else { |
1809 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; | 1809 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; |
1810 | 1810 | ||
1811 | if (gk20a_fifo_error_tsg(g, tsg)) | 1811 | if (gk20a_fifo_error_tsg(g, tsg) && verbose) |
1812 | gk20a_debug_dump(g); | 1812 | gk20a_debug_dump(g); |
1813 | 1813 | ||
1814 | gk20a_fifo_abort_tsg(g, tsgid, false); | 1814 | gk20a_fifo_abort_tsg(g, tsgid, false); |
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index cf75595a..8a578102 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h | |||
@@ -1577,13 +1577,15 @@ struct nvgpu_cycle_stats_snapshot_args { | |||
1577 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1 | 1577 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1 |
1578 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2 | 1578 | #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2 |
1579 | 1579 | ||
1580 | /* disable watchdog per-channel */ | 1580 | /* configure watchdog per-channel */ |
1581 | struct nvgpu_channel_wdt_args { | 1581 | struct nvgpu_channel_wdt_args { |
1582 | __u32 wdt_status; | 1582 | __u32 wdt_status; |
1583 | __u32 padding; | 1583 | __u32 timeout_ms; |
1584 | }; | 1584 | }; |
1585 | #define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1 | 1585 | #define NVGPU_IOCTL_CHANNEL_DISABLE_WDT (1 << 0) |
1586 | #define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2 | 1586 | #define NVGPU_IOCTL_CHANNEL_ENABLE_WDT (1 << 1) |
1587 | #define NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT (1 << 2) | ||
1588 | #define NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP (1 << 3) | ||
1587 | 1589 | ||
1588 | /* | 1590 | /* |
1589 | * Interleaving channels in a runlist is an approach to improve | 1591 | * Interleaving channels in a runlist is an approach to improve |