diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
commit | 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch) | |
tree | 8dc52ba6d8521741d2c1aa4030196e54670c2efc /libsmctrl.c | |
parent | d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff) |
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via:
- pysmctrl.get_gpc_info(dev_id)
- pysmctrl.get_tpc_info(dev_id)
- pysmctrl.get_tpc_info_cuda(cuda_dev_id)
All functions are extensively documented. See pysmctrl/__init__.py
for details.
Device partitioning functions have yet to be mapped into Python, as
these will require more testing.
As part of this:
- libsmctrl_get_*_info() functions have been modified to consistently
return positive error codes.
- libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and
uses libsmctrl_get_gpc_info() under the covers. This should be more
reliable.
- libsmctrl_get_tpc_info_cuda() has been introduced as an improved
version of the old libsmctrl_get_tpc_info() function. This continues
to use CUDA-style device numbering, but is now resilient to CUDA
failures.
- Various minor style improvements in libsmctrl.c
Diffstat (limited to 'libsmctrl.c')
-rw-r--r-- | libsmctrl.c | 91 |
1 files changed, 60 insertions, 31 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 640001a..98be1ef 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | 226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); |
227 | break; | 227 | break; |
228 | default: { | 228 | default: { |
229 | // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) | 229 | // For experimenting to determine the right mask offset, set the MASK_OFF |
230 | // environment variable (positive and negative numbers are supported) | ||
230 | char* mask_off_str = getenv("MASK_OFF"); | 231 | char* mask_off_str = getenv("MASK_OFF"); |
231 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | 232 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); |
232 | if (mask_off_str) { | 233 | if (mask_off_str) { |
233 | int off = atoi(mask_off_str); | 234 | int off = atoi(mask_off_str); |
234 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); | 235 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " |
235 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); | 236 | "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); |
237 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); | ||
236 | } else { | 238 | } else { |
237 | return; | 239 | return; |
238 | }} | 240 | }} |
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
242 | hw_mask->lower = mask; | 244 | hw_mask->lower = mask; |
243 | } | 245 | } |
244 | 246 | ||
245 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | 247 | /* INFORMATIONAL FUNCTIONS */ |
246 | int num_sms; | ||
247 | int major; | ||
248 | int minor; | ||
249 | // TODO: Use nvdebug instead of this hardcoded hack | ||
250 | cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); | ||
251 | cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); | ||
252 | cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); | ||
253 | // SM masking only works on sm_35+ | ||
254 | if (major < 3 || (major == 3 && minor < 5)) | ||
255 | return -ENOTSUP; | ||
256 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
257 | // as the P100, which is uniquely sm_60 | ||
258 | int sms_per_tpc; | ||
259 | if (major > 6 || (major == 6 && minor == 0)) | ||
260 | sms_per_tpc = 2; | ||
261 | else | ||
262 | sms_per_tpc = 1; | ||
263 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
264 | // with Hopper | ||
265 | if (major >= 9) | ||
266 | fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n"); | ||
267 | *num_tpcs = num_sms/sms_per_tpc; | ||
268 | return 0; | ||
269 | } | ||
270 | 248 | ||
271 | // Read an integer from a file in `/proc` | 249 | // Read an integer from a file in `/proc` |
272 | static int read_int_procfile(char* filename, uint64_t* out) { | 250 | static int read_int_procfile(char* filename, uint64_t* out) { |
273 | char f_data[18] = {0}; | 251 | char f_data[18] = {0}; |
274 | int fd = open(filename, O_RDONLY); | 252 | int fd = open(filename, O_RDONLY); |
275 | if (fd == -1) | 253 | if (fd == -1) |
276 | return -errno; | 254 | return errno; |
277 | read(fd, f_data, 18); | 255 | read(fd, f_data, 18); |
278 | close(fd); | 256 | close(fd); |
279 | *out = strtoll(f_data, NULL, 16); | 257 | *out = strtoll(f_data, NULL, 16); |
280 | return 0; | 258 | return 0; |
281 | } | 259 | } |
282 | 260 | ||
261 | // We support up to 12 GPCs per GPU, and up to 16 GPUs. | ||
283 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; | 262 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; |
284 | // Output mask is vtpc-indexed (virtual TPC) | 263 | // Output mask is vtpc-indexed (virtual TPC) |
285 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { | 264 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { |
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
291 | // Maximum number of GPCs supported for this chip | 270 | // Maximum number of GPCs supported for this chip |
292 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); | 271 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); |
293 | if (err = read_int_procfile(filename, &max_gpcs)) { | 272 | if (err = read_int_procfile(filename, &max_gpcs)) { |
294 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n"); | 273 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before " |
274 | "using libsmctrl_get_*_info() functions\n"); | ||
295 | return err; | 275 | return err; |
296 | } | 276 | } |
297 | // TODO: handle arbitrary-size GPUs | 277 | // TODO: handle arbitrary-size GPUs |
298 | if (dev > 16 || max_gpcs > 12) { | 278 | if (dev > 16 || max_gpcs > 12) { |
299 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); | 279 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); |
300 | return -ERANGE; | 280 | return ERANGE; |
301 | } | 281 | } |
302 | // Set bit = disabled GPC | 282 | // Set bit = disabled GPC |
303 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); | 283 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); |
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
331 | return 0; | 311 | return 0; |
332 | } | 312 | } |
333 | 313 | ||
314 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | ||
315 | uint32_t num_gpcs; | ||
316 | uint64_t* tpcs_per_gpc; | ||
317 | int res; | ||
318 | if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev)) | ||
319 | return res; | ||
320 | *num_tpcs = 0; | ||
321 | for (int gpc = 0; gpc < num_gpcs; gpc++) { | ||
322 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]); | ||
323 | } | ||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | // @param dev Device index as understood by CUDA **can differ from nvdebug idx** | ||
328 | // This implementation is fragile, and could be incorrect for odd GPUs | ||
329 | int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { | ||
330 | int num_sms, major, minor, res = 0; | ||
331 | const char* err_str; | ||
332 | if (res = cuInit(0)) | ||
333 | goto abort_cuda; | ||
334 | if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev)) | ||
335 | goto abort_cuda; | ||
336 | if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev)) | ||
337 | goto abort_cuda; | ||
338 | if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev)) | ||
339 | goto abort_cuda; | ||
340 | // SM masking only works on sm_35+ | ||
341 | if (major < 3 || (major == 3 && minor < 5)) | ||
342 | return ENOTSUP; | ||
343 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
344 | // as the P100, which is uniquely sm_60 | ||
345 | int sms_per_tpc; | ||
346 | if (major > 6 || (major == 6 && minor == 0)) | ||
347 | sms_per_tpc = 2; | ||
348 | else | ||
349 | sms_per_tpc = 1; | ||
350 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
351 | // with Hopper | ||
352 | if (major >= 9) | ||
353 | fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper," | ||
354 | " and will likely yield incorrect results! Proceed with caution.\n"); | ||
355 | *num_tpcs = num_sms/sms_per_tpc; | ||
356 | return 0; | ||
357 | abort_cuda: | ||
358 | cuGetErrorName(res, &err_str); | ||
359 | fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str); | ||
360 | return EIO; | ||
361 | } | ||
362 | |||