diff options
| author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
|---|---|---|
| committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
| commit | 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch) | |
| tree | 8dc52ba6d8521741d2c1aa4030196e54670c2efc | |
| parent | d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff) | |
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via:
- pysmctrl.get_gpc_info(dev_id)
- pysmctrl.get_tpc_info(dev_id)
- pysmctrl.get_tpc_info_cuda(cuda_dev_id)
All functions are extensively documented. See pysmctrl/__init__.py
for details.
Device partitioning functions have yet to be mapped into Python, as
these will require more testing.
As part of this:
- libsmctrl_get_*_info() functions have been modified to consistently
return positive error codes.
- libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and
uses libsmctrl_get_gpc_info() under the covers. This should be more
reliable.
- libsmctrl_get_tpc_info_cuda() has been introduced as an improved
version of the old libsmctrl_get_tpc_info() function. This continues
to use CUDA-style device numbering, but is now resilient to CUDA
failures.
- Various minor style improvements in libsmctrl.c
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | libsmctrl.c | 91 | ||||
| -rw-r--r-- | libsmctrl.h | 16 | ||||
| -rw-r--r-- | pysmctrl/__init__.py | 82 |
4 files changed, 154 insertions, 36 deletions
| @@ -2,3 +2,4 @@ libsmctrl.a | |||
| 2 | libsmctrl.o | 2 | libsmctrl.o |
| 3 | libsmctrl.so | 3 | libsmctrl.so |
| 4 | libsmctrl_test_gpc_info | 4 | libsmctrl_test_gpc_info |
| 5 | *.pyc | ||
diff --git a/libsmctrl.c b/libsmctrl.c index 640001a..98be1ef 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
| 226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | 226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); |
| 227 | break; | 227 | break; |
| 228 | default: { | 228 | default: { |
| 229 | // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) | 229 | // For experimenting to determine the right mask offset, set the MASK_OFF |
| 230 | // environment variable (positive and negative numbers are supported) | ||
| 230 | char* mask_off_str = getenv("MASK_OFF"); | 231 | char* mask_off_str = getenv("MASK_OFF"); |
| 231 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | 232 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); |
| 232 | if (mask_off_str) { | 233 | if (mask_off_str) { |
| 233 | int off = atoi(mask_off_str); | 234 | int off = atoi(mask_off_str); |
| 234 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); | 235 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " |
| 235 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); | 236 | "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); |
| 237 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); | ||
| 236 | } else { | 238 | } else { |
| 237 | return; | 239 | return; |
| 238 | }} | 240 | }} |
| @@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
| 242 | hw_mask->lower = mask; | 244 | hw_mask->lower = mask; |
| 243 | } | 245 | } |
| 244 | 246 | ||
| 245 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | 247 | /* INFORMATIONAL FUNCTIONS */ |
| 246 | int num_sms; | ||
| 247 | int major; | ||
| 248 | int minor; | ||
| 249 | // TODO: Use nvdebug instead of this hardcoded hack | ||
| 250 | cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); | ||
| 251 | cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); | ||
| 252 | cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); | ||
| 253 | // SM masking only works on sm_35+ | ||
| 254 | if (major < 3 || (major == 3 && minor < 5)) | ||
| 255 | return -ENOTSUP; | ||
| 256 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
| 257 | // as the P100, which is uniquely sm_60 | ||
| 258 | int sms_per_tpc; | ||
| 259 | if (major > 6 || (major == 6 && minor == 0)) | ||
| 260 | sms_per_tpc = 2; | ||
| 261 | else | ||
| 262 | sms_per_tpc = 1; | ||
| 263 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
| 264 | // with Hopper | ||
| 265 | if (major >= 9) | ||
| 266 | fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n"); | ||
| 267 | *num_tpcs = num_sms/sms_per_tpc; | ||
| 268 | return 0; | ||
| 269 | } | ||
| 270 | 248 | ||
// Read a hexadecimal integer from a file in `/proc`
// @param filename (in)  Path of the file to read
// @param out      (out) Location to store the parsed value at
// @return 0 on success, positive `errno`-compatible error code on failure
static int read_int_procfile(char* filename, uint64_t* out) {
	// Up to 18 characters are read ("0x" plus 16 hex digits); the extra byte
	// guarantees that the buffer is always NUL-terminated for strtoull()
	char f_data[18 + 1] = {0};
	ssize_t len;
	int fd = open(filename, O_RDONLY);
	if (fd == -1)
		return errno;
	len = read(fd, f_data, sizeof(f_data) - 1);
	if (len == -1) {
		// Save errno before close() has a chance to overwrite it
		int err = errno;
		close(fd);
		return err;
	}
	close(fd);
	// Parse as unsigned so that values with the top bit set are not clamped
	*out = strtoull(f_data, NULL, 16);
	return 0;
}
| 282 | 260 | ||
| 261 | // We support up to 12 GPCs per GPU, and up to 16 GPUs. | ||
| 283 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; | 262 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; |
| 284 | // Output mask is vtpc-indexed (virtual TPC) | 263 | // Output mask is vtpc-indexed (virtual TPC) |
| 285 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { | 264 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { |
| @@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
| 291 | // Maximum number of GPCs supported for this chip | 270 | // Maximum number of GPCs supported for this chip |
| 292 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); | 271 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); |
| 293 | if (err = read_int_procfile(filename, &max_gpcs)) { | 272 | if (err = read_int_procfile(filename, &max_gpcs)) { |
| 294 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n"); | 273 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before " |
| 274 | "using libsmctrl_get_*_info() functions\n"); | ||
| 295 | return err; | 275 | return err; |
| 296 | } | 276 | } |
| 297 | // TODO: handle arbitrary-size GPUs | 277 | // TODO: handle arbitrary-size GPUs |
| 298 | if (dev > 16 || max_gpcs > 12) { | 278 | if (dev > 16 || max_gpcs > 12) { |
| 299 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); | 279 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); |
| 300 | return -ERANGE; | 280 | return ERANGE; |
| 301 | } | 281 | } |
| 302 | // Set bit = disabled GPC | 282 | // Set bit = disabled GPC |
| 303 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); | 283 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); |
| @@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
| 331 | return 0; | 311 | return 0; |
| 332 | } | 312 | } |
| 333 | 313 | ||
// Get total number of TPCs on device number `dev`. Requires `nvdebug`.
// Implemented by summing the set bits of every per-GPC TPC mask reported by
// libsmctrl_get_gpc_info().
// @param num_tpcs (out) Location to store number of TPCs at. Only written on
//                       success.
// @param dev      (in)  `nvdebug` device ID
// @return 0 on success, `errno`-compatible error code on failure
int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
	uint32_t num_gpcs;
	uint64_t* tpcs_per_gpc;
	uint32_t total = 0;
	int res;
	if ((res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev)))
		return res;
	// Masks are 64 bits wide; the `long long` popcount is used so that no bits
	// are dropped on platforms where `long` is only 32 bits
	for (uint32_t gpc = 0; gpc < num_gpcs; gpc++)
		total += __builtin_popcountll(tpcs_per_gpc[gpc]);
	*num_tpcs = total;
	return 0;
}
| 326 | |||
| 327 | // @param dev Device index as understood by CUDA **can differ from nvdebug idx** | ||
| 328 | // This implementation is fragile, and could be incorrect for odd GPUs | ||
| 329 | int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { | ||
| 330 | int num_sms, major, minor, res = 0; | ||
| 331 | const char* err_str; | ||
| 332 | if (res = cuInit(0)) | ||
| 333 | goto abort_cuda; | ||
| 334 | if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev)) | ||
| 335 | goto abort_cuda; | ||
| 336 | if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev)) | ||
| 337 | goto abort_cuda; | ||
| 338 | if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev)) | ||
| 339 | goto abort_cuda; | ||
| 340 | // SM masking only works on sm_35+ | ||
| 341 | if (major < 3 || (major == 3 && minor < 5)) | ||
| 342 | return ENOTSUP; | ||
| 343 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
| 344 | // as the P100, which is uniquely sm_60 | ||
| 345 | int sms_per_tpc; | ||
| 346 | if (major > 6 || (major == 6 && minor == 0)) | ||
| 347 | sms_per_tpc = 2; | ||
| 348 | else | ||
| 349 | sms_per_tpc = 1; | ||
| 350 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
| 351 | // with Hopper | ||
| 352 | if (major >= 9) | ||
| 353 | fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper," | ||
| 354 | " and will likely yield incorrect results! Proceed with caution.\n"); | ||
| 355 | *num_tpcs = num_sms/sms_per_tpc; | ||
| 356 | return 0; | ||
| 357 | abort_cuda: | ||
| 358 | cuGetErrorName(res, &err_str); | ||
| 359 | fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str); | ||
| 360 | return EIO; | ||
| 361 | } | ||
| 362 | |||
diff --git a/libsmctrl.h b/libsmctrl.h index f342afa..64ae7a7 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
| @@ -50,15 +50,21 @@ extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_ | |||
| 50 | 50 | ||
| 51 | /* INFORMATIONAL FUNCTIONS */ | 51 | /* INFORMATIONAL FUNCTIONS */ |
| 52 | 52 | ||
| 53 | // Get total number of TPCs on device number `dev`. | ||
| 54 | extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev); | ||
| 55 | // Get number of GPCs for devices number `dev`, and a GPC-indexed array | 53 | // Get number of GPCs for devices number `dev`, and a GPC-indexed array |
| 56 | // containing masks of which TPCs are associated with each GPC. | 54 | // containing masks of which TPCs are associated with each GPC. |
| 57 | // Note that the `nvdebug` module must be loaded to use this function. | 55 | // Note that the `nvdebug` module must be loaded to use this function. |
| 58 | // @param num_enabled_gpcs Location to store number of GPCs in | 56 | // @param num_enabled_gpcs (out) Location to store number of GPCs in |
| 59 | // @param tpcs_for_gpc Pointer to store pointer to output buffer at | 57 | // @param tpcs_for_gpc (out) Pointer to store pointer to output buffer at |
| 60 | // @return 0 on success, error code on error | 58 | // @param dev (in) `nvdebug` device ID |
| 59 | // @return 0 on success, `errno`-compatible error code on failure | ||
| 61 | extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); | 60 | extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); |
| 61 | // Get total number of TPCs on device number `dev`. Requires `nvdebug`. | ||
| 62 | // @param num_tpcs (out) Location to store number of TPCs at | ||
| 63 | // @param dev (in) `nvdebug` device ID | ||
| 64 | // @return 0 on success, `errno`-compatible error code on failure | ||
| 65 | extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev); | ||
| 66 | // Identical to above, but for a CUDA device ID. Does not require `nvdebug`. | ||
| 67 | extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev); | ||
| 62 | 68 | ||
| 63 | #ifdef __cplusplus | 69 | #ifdef __cplusplus |
| 64 | } | 70 | } |
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py new file mode 100644 index 0000000..5dc6175 --- /dev/null +++ b/pysmctrl/__init__.py | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | import ctypes, ctypes.util | ||
| 2 | import os | ||
| 3 | |||
# Locate and load the libsmctrl shared library.
# find_library() expects the bare library name, without the "lib" prefix or
# any suffix, so "smctrl" is what locates libsmctrl.so.
# If this is failing, make sure that the directory containing libsmctrl.so is
# in your LD_LIBRARY_PATH environment variable. You likely need something like:
# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
libsmctrl_path = ctypes.util.find_library("smctrl")
if not libsmctrl_path:
    # Fall back to a libsmctrl.so in the parent directory of this package
    libsmctrl_path = os.path.join(__path__[0], os.pardir, "libsmctrl.so")
libsmctrl = ctypes.CDLL(libsmctrl_path)
| 11 | |||
def get_gpc_info(device_num):
    """
    Report which thread processing clusters (TPCs) belong to each enabled
    general processing cluster (GPC) of a GPU.

    Parameters
    ----------
    device_num : int
        Index of the GPU to query, counted from 0 in the order defined by the
        nvdebug module. May not match CUDA device numbering.

    Returns
    -------
    list of int64
        One bitmask per enabled GPC. If bit `i` of an entry is set, TPC `i`
        is part of the GPC at that list index. Obtained via GPU register
        reads in `nvdebug`.

    Raises
    ------
    OSError
        If libsmctrl_get_gpc_info() returns a non-zero error code.
    """
    gpc_count = ctypes.c_uint()
    # Receives the address of the mask array from the library
    mask_array = ctypes.pointer(ctypes.c_ulonglong())
    status = libsmctrl.libsmctrl_get_gpc_info(
        ctypes.byref(gpc_count), ctypes.byref(mask_array), device_num)
    if status == 0:
        masks = []
        for gpc_idx in range(gpc_count.value):
            masks.append(mask_array[gpc_idx])
        return masks
    print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 37 | |||
def get_tpc_info(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on a GPU.

    Parameters
    ----------
    device_num : int
        Index of the GPU to query, counted from 0 in the order defined by the
        `nvdebug` module. May not match CUDA device numbering.

    Returns
    -------
    int
        Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.

    Raises
    ------
    OSError
        If libsmctrl_get_tpc_info() returns a non-zero error code.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 60 | |||
def get_tpc_info_cuda(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on a GPU that is
    identified by its CUDA device ID.

    Parameters
    ----------
    device_num : int
        CUDA device ID of the GPU to query.

    Returns
    -------
    int
        Count of enabled TPCs. Obtained via calculations on data from CUDA.

    Raises
    ------
    OSError
        If libsmctrl_get_tpc_info_cuda() returns a non-zero error code.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 82 | |||
