Introduce pysmctrl: A python interface to libsmctrl

Initially supports the GPU information functions via: - pysmctrl.get_gpc_info(dev_id) - pysmctrl.get_tpc_info(dev_id) - pysmctrl.get_tpc_info_cuda(cuda_dev_id) All functions are extensively documented. See pysmctrl/__init__.py for details. Device partitioning functions have yet to be mapped into Python, as these will require more testing. As part of this: - libsmctrl_get_*_info() functions have been modified to consistently return positive error codes. - libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and uses libsmctrl_get_gpc_info() under the covers. This should be more reliable. - libsmctrl_get_tpc_info_cuda() has been introduced as an improved version of the old libsmctrl_get_tpc_info() function. This continues to use CUDA-style device numbering, but is now resilient to CUDA failures. - Various minor style improvements in libsmctrl.c
author: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-16 02:18:35 -0400
committer: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-16 02:18:35 -0400
commit: 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree: 8dc52ba6d8521741d2c1aa4030196e54670c2efc /pysmctrl
parent: d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)
1 files changed, 82 insertions, 0 deletions
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py
new file mode 100644
index 0000000..5dc6175
--- /dev/null
+++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
+import ctypes, ctypes.util
+import os
+# Locate and load libsmctrl.so.
+# find_library() expects the bare library name (the form used with the
+# linker's -l option), so pass "smctrl"; passing "libsmctrl" would look for
+# "liblibsmctrl" and never match.
+# If this is failing, make sure that the directory containing libsmctrl.so is
+# in your LD_LIBRARY_PATH environment variable (find_library() consults it on
+# Linux with Python 3.6+). You likely need something like:
+# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
+libsmctrl_path = ctypes.util.find_library("smctrl")
+if not libsmctrl_path:
+    # Fall back to a libsmctrl.so located one directory above this package
+    libsmctrl_path = __path__[0] + "/../libsmctrl.so"
+libsmctrl = ctypes.CDLL(libsmctrl_path)
+def get_gpc_info(device_num):
+    """
+    Look up which thread processing clusters (TPCs) belong to each enabled
+    general processing cluster (GPC) of the specified GPU.
+
+    Parameters
+    ----------
+    device_num : int
+        Index of the GPU to query. Numbering starts at 0 and follows the
+        order defined by the nvdebug module, which may not match CUDA device
+        numbering.
+
+    Returns
+    -------
+    list of int
+        One bitmask per enabled GPC, each read from a C `unsigned long long`.
+        A bit set at index `i` indicates that TPC `i` is part of the GPC at
+        that list index. Obtained via GPU register reads in `nvdebug`.
+
+    Raises
+    ------
+    OSError
+        If libsmctrl_get_gpc_info() returns a nonzero code; that code is used
+        as the errno. A diagnostic line is printed before raising.
+    """
+    gpc_count = ctypes.c_uint()
+    # Placeholder pointer, passed by reference; presumably the library
+    # redirects it to its own mask array (it is indexed as an array below).
+    mask_ptr = ctypes.pointer(ctypes.c_ulonglong())
+    status = libsmctrl.libsmctrl_get_gpc_info(
+        ctypes.byref(gpc_count),
+        ctypes.byref(mask_ptr),
+        device_num)
+    if status == 0:
+        return [mask_ptr[gpc_idx] for gpc_idx in range(gpc_count.value)]
+    print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%status)
+    raise OSError(status, os.strerror(status))
+def get_tpc_info(device_num):
+    """
+    Count the thread processing clusters (TPCs) enabled on the specified
+    GPU.
+
+    Parameters
+    ----------
+    device_num : int
+        Index of the GPU to query. Numbering starts at 0 and follows the
+        order defined by the `nvdebug` module, which may not match CUDA
+        device numbering.
+
+    Returns
+    -------
+    int
+        Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.
+
+    Raises
+    ------
+    OSError
+        If libsmctrl_get_tpc_info() returns a nonzero code; that code is used
+        as the errno. A diagnostic line is printed before raising.
+    """
+    tpc_count = ctypes.c_uint()
+    status = libsmctrl.libsmctrl_get_tpc_info(
+        ctypes.byref(tpc_count),
+        device_num)
+    if status == 0:
+        return tpc_count.value
+    print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%status)
+    raise OSError(status, os.strerror(status))
+def get_tpc_info_cuda(device_num):
+    """
+    Count the thread processing clusters (TPCs) enabled on the specified
+    GPU, identified by its CUDA device ID.
+
+    Parameters
+    ----------
+    device_num : int
+        Which device to obtain the TPC count for, as a CUDA device ID.
+
+    Returns
+    -------
+    int
+        Count of enabled TPCs. Obtained via calculations on data from CUDA.
+
+    Raises
+    ------
+    OSError
+        If libsmctrl_get_tpc_info_cuda() returns a nonzero code; that code is
+        used as the errno. A diagnostic line is printed before raising.
+    """
+    tpc_count = ctypes.c_uint()
+    status = libsmctrl.libsmctrl_get_tpc_info_cuda(
+        ctypes.byref(tpc_count),
+        device_num)
+    if status == 0:
+        return tpc_count.value
+    print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%status)
+    raise OSError(status, os.strerror(status))
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-16 02:18:35 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-16 02:18:35 -0400
commit	9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree	8dc52ba6d8521741d2c1aa4030196e54670c2efc /pysmctrl
parent	d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)

diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py new file mode 100644 index 0000000..5dc6175 --- /dev/null +++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
	1	import ctypes, ctypes.util
	2	import os
	3
	4	# If this is failing, make sure that the directory containing libsmctrl.so is
	5	# in your LD_LIBRARY_PATH environment variable. You likely need something like:
	6	# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
	7	libsmctrl_path = ctypes.util.find_library("libsmctrl")
	8	if not libsmctrl_path:
	9	libsmctrl_path = __path__[0] + "/../libsmctrl.so"
	10	libsmctrl = ctypes.CDLL(libsmctrl_path)
	11
	12	def get_gpc_info(device_num):
	13	"""
	14	Obtain list of thread processing clusters (TPCs) enabled for each general
	15	processing cluster (GPC) in the specified GPU.
	16
	17	Parameters
	18	----------
	19	device_num : int
	20	Which device to obtain information for (starts as 0, order is defined
	21	by nvdebug module). May not match CUDA device numbering.
	22
	23	Returns
	24	-------
	25	list of int64
	26	A list as long as the number of GPCs enabled, where each list entry is
	27	a bitmask. A bit set at index `i` indicates that TPC `i` is part of the
	28	GPC at that list index. Obtained via GPU register reads in `nvdebug`.
	29	"""
	30	num_gpcs = ctypes.c_uint()
	31	tpc_masks = ctypes.pointer(ctypes.c_ulonglong())
	32	res = libsmctrl.libsmctrl_get_gpc_info(ctypes.byref(num_gpcs), ctypes.byref(tpc_masks), device_num)
	33	if res != 0:
	34	print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%res)
	35	raise OSError(res, os.strerror(res))
	36	return [tpc_masks[i] for i in range(num_gpcs.value)]
	37
	38	def get_tpc_info(device_num):
	39	"""
	40	Obtain a count of the total number of thread processing clusters (TPCs)
	41	enabled on the specified GPU.
	42
	43	Parameters
	44	----------
	45	device_num : int
	46	Which device to obtain TPC count for (starts as 0, order is defined by
	47	`nvdebug` module). May not match CUDA device numbering.
	48
	49	Returns
	50	-------
	51	int
	52	Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.
	53	"""
	54	num_tpcs = ctypes.c_uint()
	55	res = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(num_tpcs), device_num)
	56	if res != 0:
	57	print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%res)
	58	raise OSError(res, os.strerror(res))
	59	return num_tpcs.value
	60
	61	def get_tpc_info_cuda(device_num):
	62	"""
	63	Obtain a count of the total number of thread processing clusters (TPCs)
	64	enabled on the specified GPU.
	65
	66	Parameters
	67	----------
	68	device_num : int
	69	Which device to obtain TPC count for, as a CUDA device ID.
	70
	71	Returns
	72	-------
	73	int
	74	Count of enabled TPCs. Obtained via calculations on data from CUDA.
	75	"""
	76	num_tpcs = ctypes.c_uint()
	77	res = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(num_tpcs), device_num)
	78	if res != 0:
	79	print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%res)
	80	raise OSError(res, os.strerror(res))
	81	return num_tpcs.value
	82