Introduce pysmctrl: A python interface to libsmctrl

Initially supports the GPU information functions via: - pysmctrl.get_gpc_info(dev_id) - pysmctrl.get_tpc_info(dev_id) - pysmctrl.get_tpc_info_cuda(cuda_dev_id) All functions are extensively documented. See pysmctrl/__init__.py for details. Device partitioning functions have yet to be mapped into Python, as these will require more testing. As part of this: - libsmctrl_get_*_info() functions have been modified to consistently return positive error codes. - libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and uses libsmctrl_get_gpc_info() under the covers. This should be more reliable. - libsmctrl_get_tpc_info_cuda() has been introduced as an improved version of the old libsmctrl_get_tpc_info() function. This continues to use CUDA-style device numbering, but is now resilient to CUDA failures. - Various minor style improvements in libsmctrl.c
author: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-16 02:18:35 -0400
committer: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-16 02:18:35 -0400
commit: 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree: 8dc52ba6d8521741d2c1aa4030196e54670c2efc
parent: d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)
4 files changed, 154 insertions, 36 deletions
diff --git a/.gitignore b/.gitignore
index dcff266..437f923 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ libsmctrl.a
 libsmctrl.o
 libsmctrl.so
 libsmctrl_test_gpc_info
+*.pyc
diff --git a/libsmctrl.c b/libsmctrl.c
index 640001a..98be1ef 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
                break;
        default: {
-                // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported)
+                // For experimenting to determine the right mask offset, set the MASK_OFF
+                // environment variable (positive and negative numbers are supported)
                char* mask_off_str = getenv("MASK_OFF");
                fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
                if (mask_off_str) {
                        int off = atoi(mask_off_str);
-                        fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off);
+                        fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "
-                        hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off);
+                                        "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);
+                        hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);
                } else {
                        return;
                }}
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
        hw_mask->lower = mask;
 }
-int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
+/* INFORMATIONAL FUNCTIONS */
-        int num_sms;
-        int major;
-        int minor;
-        // TODO: Use nvdebug instead of this hardcoded hack
-        cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
-        cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
-        cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
-        // SM masking only works on sm_35+
-        if (major < 3 || (major == 3 && minor < 5))
-                return -ENOTSUP;
-        // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
-        // as the P100, which is uniquely sm_60
-        int sms_per_tpc;
-        if (major > 6 || (major == 6 && minor == 0))
-                sms_per_tpc = 2;
-        else
-                sms_per_tpc = 1;
-        // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
-        // with Hopper
-        if (major >= 9)
-                fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
-        *num_tpcs = num_sms/sms_per_tpc;
-        return 0;
-}
 // Read an integer from a file in `/proc`
 static int read_int_procfile(char* filename, uint64_t* out) {
        char f_data[18] = {0};
        int fd = open(filename, O_RDONLY);
        if (fd == -1)
-                return -errno;
+                return errno;
        read(fd, f_data, 18);
        close(fd);
        *out = strtoll(f_data, NULL, 16);
        return 0;
 }
+// We support up to 12 GPCs per GPU, and up to 16 GPUs.
 static uint64_t tpc_mask_per_gpc_per_dev[16][12];
 // Output mask is vtpc-indexed (virtual TPC)
 int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
        // Maximum number of GPCs supported for this chip
        snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
        if (err = read_int_procfile(filename, &max_gpcs)) {
-                fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");
+                fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before "
+                                "using libsmctrl_get_*_info() functions\n");
                return err;
        }
        // TODO: handle arbitrary-size GPUs
        if (dev > 16 || max_gpcs > 12) {
                fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
-                return -ERANGE;
+                return ERANGE;
        }
        // Set bit = disabled GPC
        snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
        return 0;
 }
+int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
+        uint32_t num_gpcs;
+        uint64_t* tpcs_per_gpc;
+        int res;
+        if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev))
+                return res;
+        *num_tpcs = 0;
+        for (int gpc = 0; gpc < num_gpcs; gpc++) {
+                *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
+        }
+        return 0;
+}
+// @param dev Device index as understood by CUDA **can differ from nvdebug idx**
+// This implementation is fragile, and could be incorrect for odd GPUs
+int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
+        int num_sms, major, minor, res = 0;
+        const char* err_str;
+        if (res = cuInit(0))
+                goto abort_cuda;
+        if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
+                goto abort_cuda;
+        if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
+                goto abort_cuda;
+        if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
+                goto abort_cuda;
+        // SM masking only works on sm_35+
+        if (major < 3 || (major == 3 && minor < 5))
+                return ENOTSUP;
+        // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
+        // as the P100, which is uniquely sm_60
+        int sms_per_tpc;
+        if (major > 6 || (major == 6 && minor == 0))
+                sms_per_tpc = 2;
+        else
+                sms_per_tpc = 1;
+        // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
+        // with Hopper
+        if (major >= 9)
+                fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
+                                " and will likely yield incorrect results! Proceed with caution.\n");
+        *num_tpcs = num_sms/sms_per_tpc;
+        return 0;
+abort_cuda:
+        cuGetErrorName(res, &err_str);
+        fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
+        return EIO;
+}
diff --git a/libsmctrl.h b/libsmctrl.h
index f342afa..64ae7a7 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -50,15 +50,21 @@ extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_
 /* INFORMATIONAL FUNCTIONS */
-// Get total number of TPCs on device number `dev`.
-extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
 // Get number of GPCs for devices number `dev`, and a GPC-indexed array
 // containing masks of which TPCs are associated with each GPC.
 // Note that the `nvdebug` module must be loaded to use this function.
-// @param  num_enabled_gpcs Location to store number of GPCs in
+// @param  num_enabled_gpcs (out) Location to store number of GPCs in
-// @param  tpcs_for_gpc     Pointer to store pointer to output buffer at
+// @param  tpcs_for_gpc     (out) Pointer to store pointer to output buffer at
-// @return 0 on success, error code on error
+// @param  dev               (in) `nvdebug` device ID
+// @return 0 on success, `errno`-compatible error code on failure
 extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
+// Get total number of TPCs on device number `dev`. Requires `nvdebug`.
+// @param  num_tpcs        (out) Location to store number of TPCs at
+// @param  dev              (in) `nvdebug` device ID
+// @return 0 on success, `errno`-compatible error code on failure
+extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
+// Identical to above, but for a CUDA device ID. Does not require `nvdebug`.
+extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev);
 #ifdef __cplusplus
 }
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py
new file mode 100644
index 0000000..5dc6175
--- /dev/null
+++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
+import ctypes, ctypes.util
+import os
+# If this is failing, make sure that the directory containing libsmctrl.so is
+# in your LD_LIBRARY_PATH environment variable. You likely need something like:
+# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
+libsmctrl_path = ctypes.util.find_library("libsmctrl")
+if not libsmctrl_path:
+    libsmctrl_path = __path__[0] + "/../libsmctrl.so"
+libsmctrl = ctypes.CDLL(libsmctrl_path)
+def get_gpc_info(device_num):
+    """
+    Obtain list of thread processing clusters (TPCs) enabled for each general
+    processing cluster (GPC) in the specified GPU.
+    Parameters
+    ----------
+    device_num : int
+        Which device to obtain information for (starts as 0, order is defined
+        by nvdebug module). May not match CUDA device numbering.
+    Returns
+    -------
+    list of int64
+        A list as long as the number of GPCs enabled, where each list entry is
+        a bitmask. A bit set at index `i` indicates that TPC `i` is part of the
+        GPC at that list index. Obtained via GPU register reads in `nvdebug`.
+    """
+    num_gpcs = ctypes.c_uint()
+    tpc_masks = ctypes.pointer(ctypes.c_ulonglong())
+    res = libsmctrl.libsmctrl_get_gpc_info(ctypes.byref(num_gpcs), ctypes.byref(tpc_masks), device_num)
+    if res != 0:
+        print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%res)
+        raise OSError(res, os.strerror(res))
+    return [tpc_masks[i] for i in range(num_gpcs.value)]
+def get_tpc_info(device_num):
+    """
+    Obtain a count of the total number of thread processing clusters (TPCs)
+    enabled on the specified GPU.
+    Parameters
+    ----------
+    device_num : int
+        Which device to obtain TPC count for (starts as 0, order is defined by
+        `nvdebug` module). May not match CUDA device numbering.
+    Returns
+    -------
+    int
+        Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.
+    """
+    num_tpcs = ctypes.c_uint()
+    res = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(num_tpcs), device_num)
+    if res != 0:
+        print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%res)
+        raise OSError(res, os.strerror(res))
+    return num_tpcs.value
+def get_tpc_info_cuda(device_num):
+    """
+    Obtain a count of the total number of thread processing clusters (TPCs)
+    enabled on the specified GPU.
+    Parameters
+    ----------
+    device_num : int
+        Which device to obtain TPC count for, as a CUDA device ID.
+    Returns
+    -------
+    int
+        Count of enabled TPCs. Obtained via calculations on data from CUDA.
+    """
+    num_tpcs = ctypes.c_uint()
+    res = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(num_tpcs), device_num)
+    if res != 0:
+        print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%res)
+        raise OSError(res, os.strerror(res))
+    return num_tpcs.value
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-16 02:18:35 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-16 02:18:35 -0400
commit	9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree	8dc52ba6d8521741d2c1aa4030196e54670c2efc
parent	d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)

diff --git a/.gitignore b/.gitignore index dcff266..437f923 100644 --- a/.gitignore +++ b/.gitignore
@@ -2,3 +2,4 @@ libsmctrl.a
2	libsmctrl.o	2	libsmctrl.o
3	libsmctrl.so	3	libsmctrl.so
4	libsmctrl_test_gpc_info	4	libsmctrl_test_gpc_info
		5	*.pyc


diff --git a/libsmctrl.c b/libsmctrl.c index 640001a..98be1ef 100644 --- a/libsmctrl.c +++ b/libsmctrl.c
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
226	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);	226	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
227	break;	227	break;
228	default: {	228	default: {
229	// For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported)	229	// For experimenting to determine the right mask offset, set the MASK_OFF
		230	// environment variable (positive and negative numbers are supported)
230	char* mask_off_str = getenv("MASK_OFF");	231	char* mask_off_str = getenv("MASK_OFF");
231	fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);	232	fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
232	if (mask_off_str) {	233	if (mask_off_str) {
233	int off = atoi(mask_off_str);	234	int off = atoi(mask_off_str);
234	fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off);	235	fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "
235	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off);	236	"(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);
		237	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);
236	} else {	238	} else {
237	return;	239	return;
238	}}	240	}}
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
242	hw_mask->lower = mask;	244	hw_mask->lower = mask;
243	}	245	}
244		246
245	int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {	247	/* INFORMATIONAL FUNCTIONS */
246	int num_sms;
247	int major;
248	int minor;
249	// TODO: Use nvdebug instead of this hardcoded hack
250	cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
251	cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
252	cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
253	// SM masking only works on sm_35+
254	if (major < 3 \|\| (major == 3 && minor < 5))
255	return -ENOTSUP;
256	// Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
257	// as the P100, which is uniquely sm_60
258	int sms_per_tpc;
259	if (major > 6 \|\| (major == 6 && minor == 0))
260	sms_per_tpc = 2;
261	else
262	sms_per_tpc = 1;
263	// It looks like there may be some upcoming weirdness (TPCs with only one SM?)
264	// with Hopper
265	if (major >= 9)
266	fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
267	*num_tpcs = num_sms/sms_per_tpc;
268	return 0;
269	}
270		248
271	// Read an integer from a file in `/proc`	249	// Read an integer from a file in `/proc`
272	static int read_int_procfile(char* filename, uint64_t* out) {	250	static int read_int_procfile(char* filename, uint64_t* out) {
273	char f_data[18] = {0};	251	char f_data[18] = {0};
274	int fd = open(filename, O_RDONLY);	252	int fd = open(filename, O_RDONLY);
275	if (fd == -1)	253	if (fd == -1)
276	return -errno;	254	return errno;
277	read(fd, f_data, 18);	255	read(fd, f_data, 18);
278	close(fd);	256	close(fd);
279	*out = strtoll(f_data, NULL, 16);	257	*out = strtoll(f_data, NULL, 16);
280	return 0;	258	return 0;
281	}	259	}
282		260
		261	// We support up to 12 GPCs per GPU, and up to 16 GPUs.
283	static uint64_t tpc_mask_per_gpc_per_dev[16][12];	262	static uint64_t tpc_mask_per_gpc_per_dev[16][12];
284	// Output mask is vtpc-indexed (virtual TPC)	263	// Output mask is vtpc-indexed (virtual TPC)
285	int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {	264	int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
291	// Maximum number of GPCs supported for this chip	270	// Maximum number of GPCs supported for this chip
292	snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);	271	snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
293	if (err = read_int_procfile(filename, &max_gpcs)) {	272	if (err = read_int_procfile(filename, &max_gpcs)) {
294	fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");	273	fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before "
		274	"using libsmctrl_get_*_info() functions\n");
295	return err;	275	return err;
296	}	276	}
297	// TODO: handle arbitrary-size GPUs	277	// TODO: handle arbitrary-size GPUs
298	if (dev > 16 \|\| max_gpcs > 12) {	278	if (dev > 16 \|\| max_gpcs > 12) {
299	fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");	279	fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
300	return -ERANGE;	280	return ERANGE;
301	}	281	}
302	// Set bit = disabled GPC	282	// Set bit = disabled GPC
303	snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);	283	snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
331	return 0;	311	return 0;
332	}	312	}
333		313
		314	int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
		315	uint32_t num_gpcs;
		316	uint64_t* tpcs_per_gpc;
		317	int res;
		318	if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev))
		319	return res;
		320	*num_tpcs = 0;
		321	for (int gpc = 0; gpc < num_gpcs; gpc++) {
		322	*num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
		323	}
		324	return 0;
		325	}
		326
		327	// @param dev Device index as understood by CUDA can differ from nvdebug idx
		328	// This implementation is fragile, and could be incorrect for odd GPUs
		329	int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
		330	int num_sms, major, minor, res = 0;
		331	const char* err_str;
		332	if (res = cuInit(0))
		333	goto abort_cuda;
		334	if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
		335	goto abort_cuda;
		336	if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
		337	goto abort_cuda;
		338	if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
		339	goto abort_cuda;
		340	// SM masking only works on sm_35+
		341	if (major < 3 \|\| (major == 3 && minor < 5))
		342	return ENOTSUP;
		343	// Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
		344	// as the P100, which is uniquely sm_60
		345	int sms_per_tpc;
		346	if (major > 6 \|\| (major == 6 && minor == 0))
		347	sms_per_tpc = 2;
		348	else
		349	sms_per_tpc = 1;
		350	// It looks like there may be some upcoming weirdness (TPCs with only one SM?)
		351	// with Hopper
		352	if (major >= 9)
		353	fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
		354	" and will likely yield incorrect results! Proceed with caution.\n");
		355	*num_tpcs = num_sms/sms_per_tpc;
		356	return 0;
		357	abort_cuda:
		358	cuGetErrorName(res, &err_str);
		359	fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
		360	return EIO;
		361	}
		362


diff --git a/libsmctrl.h b/libsmctrl.h index f342afa..64ae7a7 100644 --- a/libsmctrl.h +++ b/libsmctrl.h
@@ -50,15 +50,21 @@ extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_
50		50
51	/* INFORMATIONAL FUNCTIONS */	51	/* INFORMATIONAL FUNCTIONS */
52		52
53	// Get total number of TPCs on device number `dev`.
54	extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
55	// Get number of GPCs for devices number `dev`, and a GPC-indexed array	53	// Get number of GPCs for devices number `dev`, and a GPC-indexed array
56	// containing masks of which TPCs are associated with each GPC.	54	// containing masks of which TPCs are associated with each GPC.
57	// Note that the `nvdebug` module must be loaded to use this function.	55	// Note that the `nvdebug` module must be loaded to use this function.
58	// @param num_enabled_gpcs Location to store number of GPCs in	56	// @param num_enabled_gpcs (out) Location to store number of GPCs in
59	// @param tpcs_for_gpc Pointer to store pointer to output buffer at	57	// @param tpcs_for_gpc (out) Pointer to store pointer to output buffer at
60	// @return 0 on success, error code on error	58	// @param dev (in) `nvdebug` device ID
		59	// @return 0 on success, `errno`-compatible error code on failure
61	extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);	60	extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
		61	// Get total number of TPCs on device number `dev`. Requires `nvdebug`.
		62	// @param num_tpcs (out) Location to store number of TPCs at
		63	// @param dev (in) `nvdebug` device ID
		64	// @return 0 on success, `errno`-compatible error code on failure
		65	extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
		66	// Identical to above, but for a CUDA device ID. Does not require `nvdebug`.
		67	extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev);
62		68
63	#ifdef __cplusplus	69	#ifdef __cplusplus
64	}	70	}


diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py new file mode 100644 index 0000000..5dc6175 --- /dev/null +++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
		1	import ctypes, ctypes.util
		2	import os
		3
		4	# If this is failing, make sure that the directory containing libsmctrl.so is
		5	# in your LD_LIBRARY_PATH environment variable. You likely need something like:
		6	# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
		7	libsmctrl_path = ctypes.util.find_library("libsmctrl")
		8	if not libsmctrl_path:
		9	libsmctrl_path = __path__[0] + "/../libsmctrl.so"
		10	libsmctrl = ctypes.CDLL(libsmctrl_path)
		11
		12	def get_gpc_info(device_num):
		13	"""
		14	Obtain list of thread processing clusters (TPCs) enabled for each general
		15	processing cluster (GPC) in the specified GPU.
		16
		17	Parameters
		18	----------
		19	device_num : int
		20	Which device to obtain information for (starts as 0, order is defined
		21	by nvdebug module). May not match CUDA device numbering.
		22
		23	Returns
		24	-------
		25	list of int64
		26	A list as long as the number of GPCs enabled, where each list entry is
		27	a bitmask. A bit set at index `i` indicates that TPC `i` is part of the
		28	GPC at that list index. Obtained via GPU register reads in `nvdebug`.
		29	"""
		30	num_gpcs = ctypes.c_uint()
		31	tpc_masks = ctypes.pointer(ctypes.c_ulonglong())
		32	res = libsmctrl.libsmctrl_get_gpc_info(ctypes.byref(num_gpcs), ctypes.byref(tpc_masks), device_num)
		33	if res != 0:
		34	print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%res)
		35	raise OSError(res, os.strerror(res))
		36	return [tpc_masks[i] for i in range(num_gpcs.value)]
		37
		38	def get_tpc_info(device_num):
		39	"""
		40	Obtain a count of the total number of thread processing clusters (TPCs)
		41	enabled on the specified GPU.
		42
		43	Parameters
		44	----------
		45	device_num : int
		46	Which device to obtain TPC count for (starts as 0, order is defined by
		47	`nvdebug` module). May not match CUDA device numbering.
		48
		49	Returns
		50	-------
		51	int
		52	Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.
		53	"""
		54	num_tpcs = ctypes.c_uint()
		55	res = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(num_tpcs), device_num)
		56	if res != 0:
		57	print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%res)
		58	raise OSError(res, os.strerror(res))
		59	return num_tpcs.value
		60
		61	def get_tpc_info_cuda(device_num):
		62	"""
		63	Obtain a count of the total number of thread processing clusters (TPCs)
		64	enabled on the specified GPU.
		65
		66	Parameters
		67	----------
		68	device_num : int
		69	Which device to obtain TPC count for, as a CUDA device ID.
		70
		71	Returns
		72	-------
		73	int
		74	Count of enabled TPCs. Obtained via calculations on data from CUDA.
		75	"""
		76	num_tpcs = ctypes.c_uint()
		77	res = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(num_tpcs), device_num)
		78	if res != 0:
		79	print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%res)
		80	raise OSError(res, os.strerror(res))
		81	return num_tpcs.value
		82