aboutsummaryrefslogtreecommitdiffstats
path: root/pysmctrl
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
committerJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
commit9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree8dc52ba6d8521741d2c1aa4030196e54670c2efc /pysmctrl
parentd827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via: - pysmctrl.get_gpc_info(dev_id) - pysmctrl.get_tpc_info(dev_id) - pysmctrl.get_tpc_info_cuda(cuda_dev_id) All functions are extensively documented. See pysmctrl/__init__.py for details. Device partitioning functions have yet to be mapped into Python, as these will require more testing. As part of this: - libsmctrl_get_*_info() functions have been modified to consistently return positive error codes. - libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and uses libsmctrl_get_gpc_info() under the covers. This should be more reliable. - libsmctrl_get_tpc_info_cuda() has been introduced as an improved version of the old libsmctrl_get_tpc_info() function. This continues to use CUDA-style device numbering, but is now resilient to CUDA failures. - Various minor style improvements in libsmctrl.c
Diffstat (limited to 'pysmctrl')
-rw-r--r--pysmctrl/__init__.py82
1 files changed, 82 insertions, 0 deletions
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py
new file mode 100644
index 0000000..5dc6175
--- /dev/null
+++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
1import ctypes, ctypes.util
2import os
3
# Locate and load the libsmctrl shared library.
#
# ctypes.util.find_library() expects the bare library name, without the "lib"
# prefix or ".so" suffix (the same form as the linker's -l option), so
# "smctrl" is tried first. The previous "libsmctrl" spelling is still tried
# afterwards so that any setup which happened to work with it keeps working.
#
# If this is failing, make sure that the directory containing libsmctrl.so is
# in your LD_LIBRARY_PATH environment variable. You likely need something like:
# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
libsmctrl_path = (ctypes.util.find_library("smctrl")
                  or ctypes.util.find_library("libsmctrl"))
if not libsmctrl_path:
    # Fall back to libsmctrl.so in the directory containing this package
    libsmctrl_path = __path__[0] + "/../libsmctrl.so"
libsmctrl = ctypes.CDLL(libsmctrl_path)
11
def get_gpc_info(device_num):
    """
    List the thread processing clusters (TPCs) that belong to each enabled
    general processing cluster (GPC) of a GPU.

    Parameters
    ----------
    device_num : int
        Index of the GPU to query. Numbering begins at 0 and follows the
        order defined by the nvdebug module, which may differ from CUDA
        device numbering.

    Returns
    -------
    list of int
        One 64-bit bitmask per enabled GPC. Bit `i` being set in an entry
        means that TPC `i` is part of the GPC at that list position. The
        data is obtained via GPU register reads in `nvdebug`.

    Raises
    ------
    OSError
        If libsmctrl_get_gpc_info() returns a nonzero code. The returned
        code is used as the errno of the exception.
    """
    gpc_count = ctypes.c_uint()
    # Output slot for the mask array; the library call receives its address
    # and the masks are read back through it afterwards.
    mask_placeholder = ctypes.c_ulonglong()
    mask_array = ctypes.pointer(mask_placeholder)
    status = libsmctrl.libsmctrl_get_gpc_info(
        ctypes.byref(gpc_count),
        ctypes.byref(mask_array),
        device_num,
    )
    if status == 0:
        # Slicing a ctypes pointer yields a plain Python list of the values
        return mask_array[0:gpc_count.value]
    print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..." % status)
    raise OSError(status, os.strerror(status))
37
def get_tpc_info(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on a GPU, using
    nvdebug-style device numbering.

    Parameters
    ----------
    device_num : int
        Index of the GPU to query. Numbering begins at 0 and follows the
        order defined by the `nvdebug` module, which may differ from CUDA
        device numbering.

    Returns
    -------
    int
        Number of enabled TPCs. Obtained via GPU register reads in `nvdebug`.

    Raises
    ------
    OSError
        If libsmctrl_get_tpc_info() returns a nonzero code. The returned
        code is used as the errno of the exception.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..." % status)
    raise OSError(status, os.strerror(status))
60
def get_tpc_info_cuda(device_num):
    """
    Count the thread processing clusters (TPCs) enabled on a GPU, using
    CUDA-style device numbering.

    Parameters
    ----------
    device_num : int
        CUDA device ID of the GPU to query.

    Returns
    -------
    int
        Number of enabled TPCs. Obtained via calculations on data from CUDA.

    Raises
    ------
    OSError
        If libsmctrl_get_tpc_info_cuda() returns a nonzero code. The
        returned code is used as the errno of the exception.
    """
    tpc_count = ctypes.c_uint()
    status = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(tpc_count), device_num)
    if status == 0:
        return tpc_count.value
    print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..." % status)
    raise OSError(status, os.strerror(status))
82