diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-25 20:01:19 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-25 20:43:38 -0400 |
commit | 9b4b4f71fd843c3ec97ca6f55935675e62ca31f5 (patch) | |
tree | e38f415c07415078e37ad8e1a0e08b6ff513114b /nvtaskset.c | |
parent | e0cb12762d048e388a81cffae3e96bfe2bd672cc (diff) |
Speed up nvtaskset by skipping CUDA context creation if possible
The GPU needs to be on before the GPC-to-TPC mapping registers can
be read. The easiest way to power on the GPU is to create a CUDA
context, but this is fairly expensive.
In nvtaskset, some GPU-using task will likely already be running,
and so we can skip CUDA context creation in the common case.
Bug fixes:
- Delete the temporary CUDA context created by nvtaskset after
it is done with it. Fixes a bug where nvtaskset would leak this
context into any program it launches.
Additional changes:
- Style fixes in libsmctrl.c
- Remove superfluous newlines from error() calls in nvtaskset.c
Diffstat (limited to 'nvtaskset.c')
-rw-r--r-- | nvtaskset.c | 29 |
1 files changed, 27 insertions, 2 deletions
diff --git a/nvtaskset.c b/nvtaskset.c index 4901cbe..5cf3a85 100644 --- a/nvtaskset.c +++ b/nvtaskset.c | |||
@@ -68,6 +68,25 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int | |||
68 | int res; | 68 | int res; |
69 | CUcontext ctx; | 69 | CUcontext ctx; |
70 | char *old_order = NULL; | 70 | char *old_order = NULL; |
71 | int old_stderr, dev_null_fd; | ||
72 | |||
73 | // Attempt to read the configuration, assuming the GPU is on, and fall | ||
74 | // back to creating a context if this fails. | ||
75 | // (Creating a CUDA context is very expensive and best avoided) | ||
76 | // (Redirect stderr while doing this to mute libsmctrl error messages) | ||
77 | if ((dev_null_fd = open("/dev/null", O_WRONLY)) == -1) | ||
78 | error(1, errno, "Unable to open /dev/null"); | ||
79 | if (old_stderr = dup(STDERR_FILENO) == -1) | ||
80 | error(1, errno, "Unable to duplicate stderr file descriptor"); | ||
81 | if (dup2(dev_null_fd, STDERR_FILENO) == -1) | ||
82 | error(1, errno, "Unable to overwrite stderr file descriptor"); | ||
83 | res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id); | ||
84 | if (dup2(old_stderr, STDERR_FILENO) == -1) | ||
85 | error(1, errno, "Unable to restore stderr file descriptor"); | ||
86 | // End if we were successful, otherwise fallback | ||
87 | if (res == 0) | ||
88 | return; | ||
89 | |||
71 | // Tell CUDA to use PCI device id ordering (to match nvdebug) | 90 | // Tell CUDA to use PCI device id ordering (to match nvdebug) |
72 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); | 91 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); |
73 | // Allow CUDA to see all devices (to better match nvdebug) | 92 | // Allow CUDA to see all devices (to better match nvdebug) |
@@ -80,12 +99,12 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int | |||
80 | if ((res = cuInit(0))) { | 99 | if ((res = cuInit(0))) { |
81 | const char* name; | 100 | const char* name; |
82 | cuGetErrorName(res, &name); | 101 | cuGetErrorName(res, &name); |
83 | error(1, 0, "Unable to create a initialize CUDA, error %s\n", name); | 102 | error(1, 0, "Unable to create a initialize CUDA, error %s", name); |
84 | } | 103 | } |
85 | if ((res = cuCtxCreate(&ctx, 0, gpu_id))) { | 104 | if ((res = cuCtxCreate(&ctx, 0, gpu_id))) { |
86 | const char* name; | 105 | const char* name; |
87 | cuGetErrorName(res, &name); | 106 | cuGetErrorName(res, &name); |
88 | error(1, 0, "Unable to create a CUDA context, error %s\n", name); | 107 | error(1, 0, "Unable to create a CUDA context, error %s", name); |
89 | } | 108 | } |
90 | // Pull topology information from libsmctrl | 109 | // Pull topology information from libsmctrl |
91 | if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) { | 110 | if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) { |
@@ -96,6 +115,12 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int | |||
96 | fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name); | 115 | fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name); |
97 | exit(1); | 116 | exit(1); |
98 | } | 117 | } |
118 | // Delete the CUDA context | ||
119 | if (res = cuCtxDestroy(ctx)) { | ||
120 | const char* name; | ||
121 | cuGetErrorName(res, &name); | ||
122 | error(1, 0, "Unable to destroy CUDA context, error %s", name); | ||
123 | } | ||
99 | // Restore the environment (in case we exec() later) | 124 | // Restore the environment (in case we exec() later) |
100 | unsetenv("CUDA_DEVICE_ORDER"); | 125 | unsetenv("CUDA_DEVICE_ORDER"); |
101 | if (old_order) { | 126 | if (old_order) { |