diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-04-21 20:30:08 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-04-21 20:30:08 -0400 |
commit | 091c242c9ef7cbd8d88d3beae936b14f5b907286 (patch) | |
tree | 85f76f37209abc3888045ff2db8576ad3c6370d7 /nvdebug_entry.c | |
parent | 684c20c0afbfc2c2075a00881fbb3f9d3e68e023 (diff) |
Add /proc/gpu#/resubmit_runlist API
Resubmits the runlist in an identical configuration. Causes the
runlist scheduler to:
1. Reload and cache timeslice and scale values from TSGs.
2. Restart scheduling from the head of the runlist [may cause a
preempt to be scheduled for the currently-running task (?)].
3. Address (?) an erratum on Turing where re-enabled channels are
not always detected.
Above behavior tested on GV100 and partially tested on TU102.
Diffstat (limited to 'nvdebug_entry.c')
-rw-r--r-- | nvdebug_entry.c | 12 |
1 files changed, 12 insertions, 0 deletions
diff --git a/nvdebug_entry.c b/nvdebug_entry.c index 24fcd32..eee7351 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c | |||
@@ -25,6 +25,7 @@ MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs"); | |||
25 | // runlist_procfs.c | 25 | // runlist_procfs.c |
26 | extern struct file_operations runlist_file_ops; | 26 | extern struct file_operations runlist_file_ops; |
27 | extern struct file_operations preempt_tsg_file_ops; | 27 | extern struct file_operations preempt_tsg_file_ops; |
28 | extern struct file_operations resubmit_runlist_file_ops; | ||
28 | extern struct file_operations disable_channel_file_ops; | 29 | extern struct file_operations disable_channel_file_ops; |
29 | extern struct file_operations enable_channel_file_ops; | 30 | extern struct file_operations enable_channel_file_ops; |
30 | extern struct file_operations switch_to_tsg_file_ops; | 31 | extern struct file_operations switch_to_tsg_file_ops; |
@@ -256,6 +257,17 @@ int __init nvdebug_init(void) { | |||
256 | "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), | 257 | "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), |
257 | (void*)device_id)) | 258 | (void*)device_id)) |
258 | goto out_nomem; | 259 | goto out_nomem; |
260 | /* On the TU104, the context scheduler (contained in the Host, aka | ||
261 | * PFIFO, unit) has been observed to sometimes fail to schedule TSGs | ||
262 | * containing re-enabled channels. Resubmitting the runlist | ||
263 | * configuration appears to remediate this condition, and so this API | ||
264 | * is exposed to help reset GPU scheduling as necessary. | ||
265 | */ | ||
266 | // Create file `/proc/gpu#/resubmit_runlist`, world writable | ||
267 | if (!proc_create_data( | ||
268 | "resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops), | ||
269 | (void*)device_id)) | ||
270 | goto out_nomem; | ||
259 | // Create file `/proc/gpu#/disable_channel`, world writable | 271 | // Create file `/proc/gpu#/disable_channel`, world writable |
260 | if (!proc_create_data( | 272 | if (!proc_create_data( |
261 | "disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops), | 273 | "disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops), |