aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
committerJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
commit9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree8dc52ba6d8521741d2c1aa4030196e54670c2efc
parentd827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via: - pysmctrl.get_gpc_info(dev_id) - pysmctrl.get_tpc_info(dev_id) - pysmctrl.get_tpc_info_cuda(cuda_dev_id) All functions are extensively documented. See pysmctrl/__init__.py for details. Device partitioning functions have yet to be mapped into Python, as these will require more testing. As part of this: - libsmctrl_get_*_info() functions have been modified to consistently return positive error codes. - libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and uses libsmctrl_get_gpc_info() under the covers. This should be more reliable. - libsmctrl_get_tpc_info_cuda() has been introduced as an improved version of the old libsmctrl_get_tpc_info() function. This continues to use CUDA-style device numbering, but is now resilient to CUDA failures. - Various minor style improvements in libsmctrl.c
-rw-r--r--.gitignore1
-rw-r--r--libsmctrl.c91
-rw-r--r--libsmctrl.h16
-rw-r--r--pysmctrl/__init__.py82
4 files changed, 154 insertions, 36 deletions
diff --git a/.gitignore b/.gitignore
index dcff266..437f923 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ libsmctrl.a
2libsmctrl.o 2libsmctrl.o
3libsmctrl.so 3libsmctrl.so
4libsmctrl_test_gpc_info 4libsmctrl_test_gpc_info
5*.pyc
diff --git a/libsmctrl.c b/libsmctrl.c
index 640001a..98be1ef 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
226 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); 226 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
227 break; 227 break;
228 default: { 228 default: {
229 // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) 229 // For experimenting to determine the right mask offset, set the MASK_OFF
230 // environment variable (positive and negative numbers are supported)
230 char* mask_off_str = getenv("MASK_OFF"); 231 char* mask_off_str = getenv("MASK_OFF");
231 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); 232 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
232 if (mask_off_str) { 233 if (mask_off_str) {
233 int off = atoi(mask_off_str); 234 int off = atoi(mask_off_str);
234 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); 235 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "
235 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); 236 "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);
237 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);
236 } else { 238 } else {
237 return; 239 return;
238 }} 240 }}
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
242 hw_mask->lower = mask; 244 hw_mask->lower = mask;
243} 245}
244 246
245int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { 247/* INFORMATIONAL FUNCTIONS */
246 int num_sms;
247 int major;
248 int minor;
249 // TODO: Use nvdebug instead of this hardcoded hack
250 cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
251 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
252 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
253 // SM masking only works on sm_35+
254 if (major < 3 || (major == 3 && minor < 5))
255 return -ENOTSUP;
256 // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
257 // as the P100, which is uniquely sm_60
258 int sms_per_tpc;
259 if (major > 6 || (major == 6 && minor == 0))
260 sms_per_tpc = 2;
261 else
262 sms_per_tpc = 1;
263 // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
264 // with Hopper
265 if (major >= 9)
266 fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
267 *num_tpcs = num_sms/sms_per_tpc;
268 return 0;
269}
270 248
271// Read an integer from a file in `/proc` 249// Read an integer from a file in `/proc`
272static int read_int_procfile(char* filename, uint64_t* out) { 250static int read_int_procfile(char* filename, uint64_t* out) {
273 char f_data[18] = {0}; 251 char f_data[18] = {0};
274 int fd = open(filename, O_RDONLY); 252 int fd = open(filename, O_RDONLY);
275 if (fd == -1) 253 if (fd == -1)
276 return -errno; 254 return errno;
277 read(fd, f_data, 18); 255 read(fd, f_data, 18);
278 close(fd); 256 close(fd);
279 *out = strtoll(f_data, NULL, 16); 257 *out = strtoll(f_data, NULL, 16);
280 return 0; 258 return 0;
281} 259}
282 260
261// We support up to 12 GPCs per GPU, and up to 16 GPUs.
283static uint64_t tpc_mask_per_gpc_per_dev[16][12]; 262static uint64_t tpc_mask_per_gpc_per_dev[16][12];
284// Output mask is vtpc-indexed (virtual TPC) 263// Output mask is vtpc-indexed (virtual TPC)
285int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { 264int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
291 // Maximum number of GPCs supported for this chip 270 // Maximum number of GPCs supported for this chip
292 snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); 271 snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
293 if (err = read_int_procfile(filename, &max_gpcs)) { 272 if (err = read_int_procfile(filename, &max_gpcs)) {
294 fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n"); 273 fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before "
274 "using libsmctrl_get_*_info() functions\n");
295 return err; 275 return err;
296 } 276 }
297 // TODO: handle arbitrary-size GPUs 277 // TODO: handle arbitrary-size GPUs
298 if (dev > 16 || max_gpcs > 12) { 278 if (dev > 16 || max_gpcs > 12) {
299 fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); 279 fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
300 return -ERANGE; 280 return ERANGE;
301 } 281 }
302 // Set bit = disabled GPC 282 // Set bit = disabled GPC
303 snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); 283 snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
331 return 0; 311 return 0;
332} 312}
333 313
314int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
315 uint32_t num_gpcs;
316 uint64_t* tpcs_per_gpc;
317 int res;
318 if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev))
319 return res;
320 *num_tpcs = 0;
321 for (int gpc = 0; gpc < num_gpcs; gpc++) {
322 *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
323 }
324 return 0;
325}
326
327// @param dev Device index as understood by CUDA **can differ from nvdebug idx**
328// This implementation is fragile, and could be incorrect for odd GPUs
329int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
330 int num_sms, major, minor, res = 0;
331 const char* err_str;
332 if (res = cuInit(0))
333 goto abort_cuda;
334 if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
335 goto abort_cuda;
336 if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
337 goto abort_cuda;
338 if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
339 goto abort_cuda;
340 // SM masking only works on sm_35+
341 if (major < 3 || (major == 3 && minor < 5))
342 return ENOTSUP;
343 // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
344 // as the P100, which is uniquely sm_60
345 int sms_per_tpc;
346 if (major > 6 || (major == 6 && minor == 0))
347 sms_per_tpc = 2;
348 else
349 sms_per_tpc = 1;
350 // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
351 // with Hopper
352 if (major >= 9)
353 fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
354 " and will likely yield incorrect results! Proceed with caution.\n");
355 *num_tpcs = num_sms/sms_per_tpc;
356 return 0;
357abort_cuda:
358 cuGetErrorName(res, &err_str);
359 fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
360 return EIO;
361}
362
diff --git a/libsmctrl.h b/libsmctrl.h
index f342afa..64ae7a7 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -50,15 +50,21 @@ extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_
50 50
51/* INFORMATIONAL FUNCTIONS */ 51/* INFORMATIONAL FUNCTIONS */
52 52
53// Get total number of TPCs on device number `dev`.
54extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
55// Get number of GPCs for devices number `dev`, and a GPC-indexed array 53// Get number of GPCs for devices number `dev`, and a GPC-indexed array
56// containing masks of which TPCs are associated with each GPC. 54// containing masks of which TPCs are associated with each GPC.
57// Note that the `nvdebug` module must be loaded to use this function. 55// Note that the `nvdebug` module must be loaded to use this function.
58// @param num_enabled_gpcs Location to store number of GPCs in 56// @param num_enabled_gpcs (out) Location to store number of GPCs in
59// @param tpcs_for_gpc Pointer to store pointer to output buffer at 57// @param tpcs_for_gpc (out) Pointer to store pointer to output buffer at
60// @return 0 on success, error code on error 58// @param dev (in) `nvdebug` device ID
59// @return 0 on success, `errno`-compatible error code on failure
61extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); 60extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
61// Get total number of TPCs on device number `dev`. Requires `nvdebug`.
62// @param num_tpcs (out) Location to store number of TPCs at
63// @param dev (in) `nvdebug` device ID
64// @return 0 on success, `errno`-compatible error code on failure
65extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
66// Identical to above, but for a CUDA device ID. Does not require `nvdebug`.
67extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev);
62 68
63#ifdef __cplusplus 69#ifdef __cplusplus
64} 70}
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py
new file mode 100644
index 0000000..5dc6175
--- /dev/null
+++ b/pysmctrl/__init__.py
@@ -0,0 +1,82 @@
1import ctypes, ctypes.util
2import os
3
4# If this is failing, make sure that the directory containing libsmctrl.so is
5# in your LD_LIBRARY_PATH environment variable. You likely need something like:
6# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/
7libsmctrl_path = ctypes.util.find_library("libsmctrl")
8if not libsmctrl_path:
9 libsmctrl_path = __path__[0] + "/../libsmctrl.so"
10libsmctrl = ctypes.CDLL(libsmctrl_path)
11
12def get_gpc_info(device_num):
13 """
14 Obtain list of thread processing clusters (TPCs) enabled for each general
15 processing cluster (GPC) in the specified GPU.
16
17 Parameters
18 ----------
19 device_num : int
20 Which device to obtain information for (starts as 0, order is defined
21 by nvdebug module). May not match CUDA device numbering.
22
23 Returns
24 -------
25 list of int64
26 A list as long as the number of GPCs enabled, where each list entry is
27 a bitmask. A bit set at index `i` indicates that TPC `i` is part of the
28 GPC at that list index. Obtained via GPU register reads in `nvdebug`.
29 """
30 num_gpcs = ctypes.c_uint()
31 tpc_masks = ctypes.pointer(ctypes.c_ulonglong())
32 res = libsmctrl.libsmctrl_get_gpc_info(ctypes.byref(num_gpcs), ctypes.byref(tpc_masks), device_num)
33 if res != 0:
34 print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%res)
35 raise OSError(res, os.strerror(res))
36 return [tpc_masks[i] for i in range(num_gpcs.value)]
37
38def get_tpc_info(device_num):
39 """
40 Obtain a count of the total number of thread processing clusters (TPCs)
41 enabled on the specified GPU.
42
43 Parameters
44 ----------
45 device_num : int
46 Which device to obtain TPC count for (starts as 0, order is defined by
47 `nvdebug` module). May not match CUDA device numbering.
48
49 Returns
50 -------
51 int
52 Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`.
53 """
54 num_tpcs = ctypes.c_uint()
55 res = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(num_tpcs), device_num)
56 if res != 0:
57 print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%res)
58 raise OSError(res, os.strerror(res))
59 return num_tpcs.value
60
61def get_tpc_info_cuda(device_num):
62 """
63 Obtain a count of the total number of thread processing clusters (TPCs)
64 enabled on the specified GPU.
65
66 Parameters
67 ----------
68 device_num : int
69 Which device to obtain TPC count for, as a CUDA device ID.
70
71 Returns
72 -------
73 int
74 Count of enabled TPCs. Obtained via calculations on data from CUDA.
75 """
76 num_tpcs = ctypes.c_uint()
77 res = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(num_tpcs), device_num)
78 if res != 0:
79 print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%res)
80 raise OSError(res, os.strerror(res))
81 return num_tpcs.value
82