diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-16 02:18:35 -0400 |
commit | 9ed721de0e9ce564b7c852e38359398b019a5c2f (patch) | |
tree | 8dc52ba6d8521741d2c1aa4030196e54670c2efc | |
parent | d827c6c152c8dd52463f82ef11ccdfc66083a9db (diff) |
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via:
- pysmctrl.get_gpc_info(dev_id)
- pysmctrl.get_tpc_info(dev_id)
- pysmctrl.get_tpc_info_cuda(cuda_dev_id)
All functions are extensively documented. See pysmctrl/__init__.py
for details.
Device partitioning functions have yet to be mapped into Python, as
these will require more testing.
As part of this:
- libsmctrl_get_*_info() functions have been modified to consistently
return positive error codes.
- libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and
uses libsmctrl_get_gpc_info() under the covers. This should be more
reliable.
- libsmctrl_get_tpc_info_cuda() has been introduced as an improved
version of the old libsmctrl_get_tpc_info() function. This continues
to use CUDA-style device numbering, but is now resilient to CUDA
failures.
- Various minor style improvements in libsmctrl.c
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | libsmctrl.c | 91 | ||||
-rw-r--r-- | libsmctrl.h | 16 | ||||
-rw-r--r-- | pysmctrl/__init__.py | 82 |
4 files changed, 154 insertions, 36 deletions
@@ -2,3 +2,4 @@ libsmctrl.a | |||
2 | libsmctrl.o | 2 | libsmctrl.o |
3 | libsmctrl.so | 3 | libsmctrl.so |
4 | libsmctrl_test_gpc_info | 4 | libsmctrl_test_gpc_info |
5 | *.pyc | ||
diff --git a/libsmctrl.c b/libsmctrl.c index 640001a..98be1ef 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | 226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); |
227 | break; | 227 | break; |
228 | default: { | 228 | default: { |
229 | // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) | 229 | // For experimenting to determine the right mask offset, set the MASK_OFF |
230 | // environment variable (positive and negative numbers are supported) | ||
230 | char* mask_off_str = getenv("MASK_OFF"); | 231 | char* mask_off_str = getenv("MASK_OFF"); |
231 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | 232 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); |
232 | if (mask_off_str) { | 233 | if (mask_off_str) { |
233 | int off = atoi(mask_off_str); | 234 | int off = atoi(mask_off_str); |
234 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); | 235 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " |
235 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); | 236 | "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); |
237 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); | ||
236 | } else { | 238 | } else { |
237 | return; | 239 | return; |
238 | }} | 240 | }} |
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
242 | hw_mask->lower = mask; | 244 | hw_mask->lower = mask; |
243 | } | 245 | } |
244 | 246 | ||
245 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | 247 | /* INFORMATIONAL FUNCTIONS */ |
246 | int num_sms; | ||
247 | int major; | ||
248 | int minor; | ||
249 | // TODO: Use nvdebug instead of this hardcoded hack | ||
250 | cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); | ||
251 | cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); | ||
252 | cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); | ||
253 | // SM masking only works on sm_35+ | ||
254 | if (major < 3 || (major == 3 && minor < 5)) | ||
255 | return -ENOTSUP; | ||
256 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
257 | // as the P100, which is uniquely sm_60 | ||
258 | int sms_per_tpc; | ||
259 | if (major > 6 || (major == 6 && minor == 0)) | ||
260 | sms_per_tpc = 2; | ||
261 | else | ||
262 | sms_per_tpc = 1; | ||
263 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
264 | // with Hopper | ||
265 | if (major >= 9) | ||
266 | fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n"); | ||
267 | *num_tpcs = num_sms/sms_per_tpc; | ||
268 | return 0; | ||
269 | } | ||
270 | 248 | ||
271 | // Read an integer from a file in `/proc` | 249 | // Read an integer from a file in `/proc` |
272 | static int read_int_procfile(char* filename, uint64_t* out) { | 250 | static int read_int_procfile(char* filename, uint64_t* out) { |
273 | char f_data[18] = {0}; | 251 | char f_data[18] = {0}; |
274 | int fd = open(filename, O_RDONLY); | 252 | int fd = open(filename, O_RDONLY); |
275 | if (fd == -1) | 253 | if (fd == -1) |
276 | return -errno; | 254 | return errno; |
277 | read(fd, f_data, 18); | 255 | read(fd, f_data, 18); |
278 | close(fd); | 256 | close(fd); |
279 | *out = strtoll(f_data, NULL, 16); | 257 | *out = strtoll(f_data, NULL, 16); |
280 | return 0; | 258 | return 0; |
281 | } | 259 | } |
282 | 260 | ||
261 | // We support up to 12 GPCs per GPU, and up to 16 GPUs. | ||
283 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; | 262 | static uint64_t tpc_mask_per_gpc_per_dev[16][12]; |
284 | // Output mask is vtpc-indexed (virtual TPC) | 263 | // Output mask is vtpc-indexed (virtual TPC) |
285 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { | 264 | int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { |
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
291 | // Maximum number of GPCs supported for this chip | 270 | // Maximum number of GPCs supported for this chip |
292 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); | 271 | snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); |
293 | if (err = read_int_procfile(filename, &max_gpcs)) { | 272 | if (err = read_int_procfile(filename, &max_gpcs)) { |
294 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n"); | 273 | fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before " |
274 | "using libsmctrl_get_*_info() functions\n"); | ||
295 | return err; | 275 | return err; |
296 | } | 276 | } |
297 | // TODO: handle arbitrary-size GPUs | 277 | // TODO: handle arbitrary-size GPUs |
298 | if (dev > 16 || max_gpcs > 12) { | 278 | if (dev > 16 || max_gpcs > 12) { |
299 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); | 279 | fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); |
300 | return -ERANGE; | 280 | return ERANGE; |
301 | } | 281 | } |
302 | // Set bit = disabled GPC | 282 | // Set bit = disabled GPC |
303 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); | 283 | snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); |
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, | |||
331 | return 0; | 311 | return 0; |
332 | } | 312 | } |
333 | 313 | ||
314 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | ||
315 | uint32_t num_gpcs; | ||
316 | uint64_t* tpcs_per_gpc; | ||
317 | int res; | ||
318 | if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev)) | ||
319 | return res; | ||
320 | *num_tpcs = 0; | ||
321 | for (int gpc = 0; gpc < num_gpcs; gpc++) { | ||
322 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]); | ||
323 | } | ||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | // @param dev Device index as understood by CUDA **can differ from nvdebug idx** | ||
328 | // This implementation is fragile, and could be incorrect for odd GPUs | ||
329 | int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { | ||
330 | int num_sms, major, minor, res = 0; | ||
331 | const char* err_str; | ||
332 | if (res = cuInit(0)) | ||
333 | goto abort_cuda; | ||
334 | if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev)) | ||
335 | goto abort_cuda; | ||
336 | if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev)) | ||
337 | goto abort_cuda; | ||
338 | if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev)) | ||
339 | goto abort_cuda; | ||
340 | // SM masking only works on sm_35+ | ||
341 | if (major < 3 || (major == 3 && minor < 5)) | ||
342 | return ENOTSUP; | ||
343 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
344 | // as the P100, which is uniquely sm_60 | ||
345 | int sms_per_tpc; | ||
346 | if (major > 6 || (major == 6 && minor == 0)) | ||
347 | sms_per_tpc = 2; | ||
348 | else | ||
349 | sms_per_tpc = 1; | ||
350 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
351 | // with Hopper | ||
352 | if (major >= 9) | ||
353 | fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper," | ||
354 | " and will likely yield incorrect results! Proceed with caution.\n"); | ||
355 | *num_tpcs = num_sms/sms_per_tpc; | ||
356 | return 0; | ||
357 | abort_cuda: | ||
358 | cuGetErrorName(res, &err_str); | ||
359 | fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str); | ||
360 | return EIO; | ||
361 | } | ||
362 | |||
diff --git a/libsmctrl.h b/libsmctrl.h index f342afa..64ae7a7 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
@@ -50,15 +50,21 @@ extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_ | |||
50 | 50 | ||
51 | /* INFORMATIONAL FUNCTIONS */ | 51 | /* INFORMATIONAL FUNCTIONS */ |
52 | 52 | ||
53 | // Get total number of TPCs on device number `dev`. | ||
54 | extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev); | ||
55 | // Get number of GPCs for devices number `dev`, and a GPC-indexed array | 53 | // Get number of GPCs for devices number `dev`, and a GPC-indexed array |
56 | // containing masks of which TPCs are associated with each GPC. | 54 | // containing masks of which TPCs are associated with each GPC. |
57 | // Note that the `nvdebug` module must be loaded to use this function. | 55 | // Note that the `nvdebug` module must be loaded to use this function. |
58 | // @param num_enabled_gpcs Location to store number of GPCs in | 56 | // @param num_enabled_gpcs (out) Location to store number of GPCs in |
59 | // @param tpcs_for_gpc Pointer to store pointer to output buffer at | 57 | // @param tpcs_for_gpc (out) Pointer to store pointer to output buffer at |
60 | // @return 0 on success, error code on error | 58 | // @param dev (in) `nvdebug` device ID |
59 | // @return 0 on success, `errno`-compatible error code on failure | ||
61 | extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); | 60 | extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); |
61 | // Get total number of TPCs on device number `dev`. Requires `nvdebug`. | ||
62 | // @param num_tpcs (out) Location to store number of TPCs at | ||
63 | // @param dev (in) `nvdebug` device ID | ||
64 | // @return 0 on success, `errno`-compatible error code on failure | ||
65 | extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev); | ||
66 | // Identical to above, but for a CUDA device ID. Does not require `nvdebug`. | ||
67 | extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev); | ||
62 | 68 | ||
63 | #ifdef __cplusplus | 69 | #ifdef __cplusplus |
64 | } | 70 | } |
diff --git a/pysmctrl/__init__.py b/pysmctrl/__init__.py new file mode 100644 index 0000000..5dc6175 --- /dev/null +++ b/pysmctrl/__init__.py | |||
@@ -0,0 +1,82 @@ | |||
1 | import ctypes, ctypes.util | ||
2 | import os | ||
3 | |||
4 | # If this is failing, make sure that the directory containing libsmctrl.so is | ||
5 | # in your LD_LIBRARY_PATH environment variable. You likely need something like: | ||
6 | # LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/playpen/jbakita/gpu_subdiv/libsmctrl/ | ||
7 | libsmctrl_path = ctypes.util.find_library("libsmctrl") | ||
8 | if not libsmctrl_path: | ||
9 | libsmctrl_path = __path__[0] + "/../libsmctrl.so" | ||
10 | libsmctrl = ctypes.CDLL(libsmctrl_path) | ||
11 | |||
12 | def get_gpc_info(device_num): | ||
13 | """ | ||
14 | Obtain list of thread processing clusters (TPCs) enabled for each general | ||
15 | processing cluster (GPC) in the specified GPU. | ||
16 | |||
17 | Parameters | ||
18 | ---------- | ||
19 | device_num : int | ||
20 | Which device to obtain information for (starts as 0, order is defined | ||
21 | by nvdebug module). May not match CUDA device numbering. | ||
22 | |||
23 | Returns | ||
24 | ------- | ||
25 | list of int64 | ||
26 | A list as long as the number of GPCs enabled, where each list entry is | ||
27 | a bitmask. A bit set at index `i` indicates that TPC `i` is part of the | ||
28 | GPC at that list index. Obtained via GPU register reads in `nvdebug`. | ||
29 | """ | ||
30 | num_gpcs = ctypes.c_uint() | ||
31 | tpc_masks = ctypes.pointer(ctypes.c_ulonglong()) | ||
32 | res = libsmctrl.libsmctrl_get_gpc_info(ctypes.byref(num_gpcs), ctypes.byref(tpc_masks), device_num) | ||
33 | if res != 0: | ||
34 | print("pysmctrl: Unable to call libsmctrl_get_gpc_info(). Raising error %d..."%res) | ||
35 | raise OSError(res, os.strerror(res)) | ||
36 | return [tpc_masks[i] for i in range(num_gpcs.value)] | ||
37 | |||
38 | def get_tpc_info(device_num): | ||
39 | """ | ||
40 | Obtain a count of the total number of thread processing clusters (TPCs) | ||
41 | enabled on the specified GPU. | ||
42 | |||
43 | Parameters | ||
44 | ---------- | ||
45 | device_num : int | ||
46 | Which device to obtain TPC count for (starts as 0, order is defined by | ||
47 | `nvdebug` module). May not match CUDA device numbering. | ||
48 | |||
49 | Returns | ||
50 | ------- | ||
51 | int | ||
52 | Count of enabled TPCs. Obtained via GPU register reads in `nvdebug`. | ||
53 | """ | ||
54 | num_tpcs = ctypes.c_uint() | ||
55 | res = libsmctrl.libsmctrl_get_tpc_info(ctypes.byref(num_tpcs), device_num) | ||
56 | if res != 0: | ||
57 | print("pysmctrl: Unable to call libsmctrl_get_tpc_info(). Raising error %d..."%res) | ||
58 | raise OSError(res, os.strerror(res)) | ||
59 | return num_tpcs.value | ||
60 | |||
61 | def get_tpc_info_cuda(device_num): | ||
62 | """ | ||
63 | Obtain a count of the total number of thread processing clusters (TPCs) | ||
64 | enabled on the specified GPU. | ||
65 | |||
66 | Parameters | ||
67 | ---------- | ||
68 | device_num : int | ||
69 | Which device to obtain TPC count for, as a CUDA device ID. | ||
70 | |||
71 | Returns | ||
72 | ------- | ||
73 | int | ||
74 | Count of enabled TPCs. Obtained via calculations on data from CUDA. | ||
75 | """ | ||
76 | num_tpcs = ctypes.c_uint() | ||
77 | res = libsmctrl.libsmctrl_get_tpc_info_cuda(ctypes.byref(num_tpcs), device_num) | ||
78 | if res != 0: | ||
79 | print("pysmctrl: Unable to call libsmctrl_get_tpc_info_cuda(). Raising error %d..."%res) | ||
80 | raise OSError(res, os.strerror(res)) | ||
81 | return num_tpcs.value | ||
82 | |||