aboutsummaryrefslogtreecommitdiffstats
path: root/nvdebug_entry.c
diff options
context:
space:
mode:
author Joshua Bakita <bakitajoshua@gmail.com> 2024-04-21 20:30:08 -0400
committer Joshua Bakita <bakitajoshua@gmail.com> 2024-04-21 20:30:08 -0400
commit 091c242c9ef7cbd8d88d3beae936b14f5b907286 (patch)
tree 85f76f37209abc3888045ff2db8576ad3c6370d7 /nvdebug_entry.c
parent 684c20c0afbfc2c2075a00881fbb3f9d3e68e023 (diff)
Add /proc/gpu#/resubmit_runlist API
Resubmits the runlist in an identical configuration. Causes the runlist scheduler to: 1. Reload and cache timeslice and scale values from TSGs. 2. Restart scheduling from the head of the runlist [may cause a preempt to be scheduled for the currently-running task (?)]. 3. Address (?) an erratum on Turing where re-enabled channels are not always detected. Above behavior tested on GV100 and partially tested on TU102.
Diffstat (limited to 'nvdebug_entry.c')
-rw-r--r-- nvdebug_entry.c 12
1 files changed, 12 insertions, 0 deletions
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 24fcd32..eee7351 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -25,6 +25,7 @@ MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
25// runlist_procfs.c 25// runlist_procfs.c
26extern struct file_operations runlist_file_ops; 26extern struct file_operations runlist_file_ops;
27extern struct file_operations preempt_tsg_file_ops; 27extern struct file_operations preempt_tsg_file_ops;
28extern struct file_operations resubmit_runlist_file_ops;
28extern struct file_operations disable_channel_file_ops; 29extern struct file_operations disable_channel_file_ops;
29extern struct file_operations enable_channel_file_ops; 30extern struct file_operations enable_channel_file_ops;
30extern struct file_operations switch_to_tsg_file_ops; 31extern struct file_operations switch_to_tsg_file_ops;
@@ -256,6 +257,17 @@ int __init nvdebug_init(void) {
256 "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), 257 "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
257 (void*)device_id)) 258 (void*)device_id))
258 goto out_nomem; 259 goto out_nomem;
260 /* On the TU104, the context scheduler (contained in the Host, aka
261 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
262 * containing re-enabled channels. Resubmitting the runlist
263 * configuration appears to remediate this condition, and so this API
264 * is exposed to help reset GPU scheduling as necessary.
265 */
266 // Create file `/proc/gpu#/resubmit_runlist`, world writable
267 if (!proc_create_data(
268 "resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops),
269 (void*)device_id))
270 goto out_nomem;
259 // Create file `/proc/gpu#/disable_channel`, world writable 271 // Create file `/proc/gpu#/disable_channel`, world writable
260 if (!proc_create_data( 272 if (!proc_create_data(
261 "disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops), 273 "disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),