diff options
| author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
|---|---|---|
| committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
| commit | 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch) | |
| tree | 8dc52ba6d8521741d2c1aa4030196e54670c2efc /pysmctrl | |
| parent | d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff) | |
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via:
- pysmctrl.get_gpc_info(dev_id)
- pysmctrl.get_tpc_info(dev_id)
- pysmctrl.get_tpc_info_cuda(cuda_dev_id)
All functions are extensively documented. See pysmctrl/__init__.py
for details.
Device partitioning functions have yet to be mapped into Python, as
these will require more testing.
As part of this:
- libsmctrl_get_*_info() functions have been modified to consistently
return positive error codes.
- libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and
uses libsmctrl_get_gpc_info() under the covers. This should be more
reliable.
- libsmctrl_get_tpc_info_cuda() has been introduced as an improved
version of the old libsmctrl_get_tpc_info() function. This continues
to use CUDA-style device numbering, but is now resilient to CUDA
failures.
- Various minor style improvements in libsmctrl.c
Diffstat (limited to 'pysmctrl')
| -rw-r--r-- | pysmctrl/__init__.py | 82 |
1 files changed, 82 insertions, 0 deletions
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py new file mode 100644 index 0000000..5dc6175 --- /dev/null +++ b/pysmctrl/__init__.py | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | import ctypes, ctypes.util | ||
| 2 | import os | ||
| 3 | |||
# Locate and load the libsmctrl shared library at import time.
#
# ctypes.util.find_library() takes the library name WITHOUT the "lib" prefix
# or ".so" suffix, so the correct search name is "smctrl". The prefixed
# spelling is still tried afterwards so that any setup which happened to
# resolve under the old name keeps working.
#
# If this is failing, make sure that the directory containing libsmctrl.so is
# in your LD_LIBRARY_PATH environment variable. You likely need something like:
# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
libsmctrl_path = (ctypes.util.find_library("smctrl")
                  or ctypes.util.find_library("libsmctrl"))
if not libsmctrl_path:
    # Last resort: look for libsmctrl.so in the parent directory of this package.
    libsmctrl_path = __path__[0] + "/../libsmctrl.so"
# Raises OSError here if the library cannot be loaded from the chosen path.
libsmctrl = ctypes.CDLL(libsmctrl_path)
| 11 | |||
def get_gpc_info(device_num):
    """
    List the thread processing clusters (TPCs) enabled in each general
    processing cluster (GPC) of the specified GPU.

    Parameters
    ----------
    device_num : int
        Which device to obtain information for (starts at 0, order is defined
        by the `nvdebug` module). May not match CUDA device numbering.

    Returns
    -------
    list of int64
        One entry per enabled GPC. Each entry is a bitmask in which a set bit
        at index `i` indicates that TPC `i` belongs to the GPC at that list
        index. Obtained via GPU register reads in `nvdebug`.

    Raises
    ------
    OSError
        If `libsmctrl_get_gpc_info()` returns a non-zero code; that code is
        used as the errno of the exception.
    """
    gpc_count = ctypes.c_uint()
    # Out-parameter: the library call receives the address of this pointer.
    mask_ptr = ctypes.pointer(ctypes.c_ulonglong())
    status = libsmctrl.libsmctrl_get_gpc_info(
        ctypes.byref(gpc_count), ctypes.byref(mask_ptr), device_num)
    if status == 0:
        # Slicing a ctypes pointer yields a plain Python list of its elements.
        return mask_ptr[:gpc_count.value]
    # Non-zero status: report it, then surface it as an OSError.
    print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 37 | |||
def get_tpc_info(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on the specified GPU.

    Parameters
    ----------
    device_num : int
        Which device to obtain the TPC count for (starts at 0, order is
        defined by the `nvdebug` module). May not match CUDA device numbering.

    Returns
    -------
    int
        Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.

    Raises
    ------
    OSError
        If `libsmctrl_get_tpc_info()` returns a non-zero code; that code is
        used as the errno of the exception.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    # Non-zero status: report it, then surface it as an OSError.
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 60 | |||
def get_tpc_info_cuda(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on the specified GPU,
    addressing the GPU by its CUDA device ID.

    Parameters
    ----------
    device_num : int
        Which device to obtain the TPC count for, as a CUDA device ID.

    Returns
    -------
    int
        Count of enabled TPCs. Obtained via calculations on data from CUDA.

    Raises
    ------
    OSError
        If `libsmctrl_get_tpc_info_cuda()` returns a non-zero code; that code
        is used as the errno of the exception.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    # Non-zero status: report it, then surface it as an OSError.
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%status)
    raise OSError(status, os.strerror(status))
| 82 | |||
