aboutsummaryrefslogtreecommitdiffstats
path: root/libsmctrl.c
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
committerJoshua Bakita <bakitajoshua@gmail.com>2023-03-16 02:18:35 -0400
commit9ed721de0e9ce564b7c852e38359398b019a5c2f (patch)
tree8dc52ba6d8521741d2c1aa4030196e54670c2efc /libsmctrl.c
parentd827c6c152c8dd52463f82ef11ccdfc66083a9db (diff)
Introduce pysmctrl: A python interface to libsmctrl
Initially supports the GPU information functions via: - pysmctrl.get_gpc_info(dev_id) - pysmctrl.get_tpc_info(dev_id) - pysmctrl.get_tpc_info_cuda(cuda_dev_id) All functions are extensively documented. See pysmctrl/__init__.py for details. Device partitioning functions have yet to be mapped into Python, as these will require more testing. As part of this: - libsmctrl_get_*_info() functions have been modified to consistently return positive error codes. - libsmctrl_get_tpc_info() now uses nvdebug-style device numbering and uses libsmctrl_get_gpc_info() under the covers. This should be more reliable. - libsmctrl_get_tpc_info_cuda() has been introduced as an improved version of the old libsmctrl_get_tpc_info() function. This continues to use CUDA-style device numbering, but is now resilient to CUDA failures. - Various minor style improvements in libsmctrl.c
Diffstat (limited to 'libsmctrl.c')
-rw-r--r--libsmctrl.c91
1 files changed, 60 insertions, 31 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 640001a..98be1ef 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -226,13 +226,15 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
226 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); 226 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
227 break; 227 break;
228 default: { 228 default: {
229 // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) 229 // For experimenting to determine the right mask offset, set the MASK_OFF
230 // environment variable (positive and negative numbers are supported)
230 char* mask_off_str = getenv("MASK_OFF"); 231 char* mask_off_str = getenv("MASK_OFF");
231 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); 232 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
232 if (mask_off_str) { 233 if (mask_off_str) {
233 int off = atoi(mask_off_str); 234 int off = atoi(mask_off_str);
234 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); 235 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "
235 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); 236 "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);
237 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);
236 } else { 238 } else {
237 return; 239 return;
238 }} 240 }}
@@ -242,44 +244,21 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
242 hw_mask->lower = mask; 244 hw_mask->lower = mask;
243} 245}
244 246
245int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { 247/* INFORMATIONAL FUNCTIONS */
246 int num_sms;
247 int major;
248 int minor;
249 // TODO: Use nvdebug instead of this hardcoded hack
250 cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
251 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
252 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
253 // SM masking only works on sm_35+
254 if (major < 3 || (major == 3 && minor < 5))
255 return -ENOTSUP;
256 // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
257 // as the P100, which is uniquely sm_60
258 int sms_per_tpc;
259 if (major > 6 || (major == 6 && minor == 0))
260 sms_per_tpc = 2;
261 else
262 sms_per_tpc = 1;
263 // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
264 // with Hopper
265 if (major >= 9)
266 fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
267 *num_tpcs = num_sms/sms_per_tpc;
268 return 0;
269}
270 248
271// Read an integer from a file in `/proc` 249// Read an integer from a file in `/proc`
272static int read_int_procfile(char* filename, uint64_t* out) { 250static int read_int_procfile(char* filename, uint64_t* out) {
273 char f_data[18] = {0}; 251 char f_data[18] = {0};
274 int fd = open(filename, O_RDONLY); 252 int fd = open(filename, O_RDONLY);
275 if (fd == -1) 253 if (fd == -1)
276 return -errno; 254 return errno;
277 read(fd, f_data, 18); 255 read(fd, f_data, 18);
278 close(fd); 256 close(fd);
279 *out = strtoll(f_data, NULL, 16); 257 *out = strtoll(f_data, NULL, 16);
280 return 0; 258 return 0;
281} 259}
282 260
261// We support up to 12 GPCs per GPU, and up to 16 GPUs.
283static uint64_t tpc_mask_per_gpc_per_dev[16][12]; 262static uint64_t tpc_mask_per_gpc_per_dev[16][12];
284// Output mask is vtpc-indexed (virtual TPC) 263// Output mask is vtpc-indexed (virtual TPC)
285int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) { 264int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
@@ -291,13 +270,14 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
291 // Maximum number of GPCs supported for this chip 270 // Maximum number of GPCs supported for this chip
292 snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev); 271 snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
293 if (err = read_int_procfile(filename, &max_gpcs)) { 272 if (err = read_int_procfile(filename, &max_gpcs)) {
294 fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n"); 273 fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before "
274 "using libsmctrl_get_*_info() functions\n");
295 return err; 275 return err;
296 } 276 }
297 // TODO: handle arbitrary-size GPUs 277 // TODO: handle arbitrary-size GPUs
298 if (dev > 16 || max_gpcs > 12) { 278 if (dev > 16 || max_gpcs > 12) {
299 fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n"); 279 fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
300 return -ERANGE; 280 return ERANGE;
301 } 281 }
302 // Set bit = disabled GPC 282 // Set bit = disabled GPC
303 snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev); 283 snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
@@ -331,3 +311,52 @@ int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc,
331 return 0; 311 return 0;
332} 312}
333 313
314int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
315 uint32_t num_gpcs;
316 uint64_t* tpcs_per_gpc;
317 int res;
318 if (res = libsmctrl_get_gpc_info(&num_gpcs, &tpcs_per_gpc, dev))
319 return res;
320 *num_tpcs = 0;
321 for (int gpc = 0; gpc < num_gpcs; gpc++) {
322 *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
323 }
324 return 0;
325}
326
327// @param dev Device index as understood by CUDA **can differ from nvdebug idx**
328// This implementation is fragile, and could be incorrect for odd GPUs
329int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
330 int num_sms, major, minor, res = 0;
331 const char* err_str;
332 if (res = cuInit(0))
333 goto abort_cuda;
334 if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
335 goto abort_cuda;
336 if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
337 goto abort_cuda;
338 if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
339 goto abort_cuda;
340 // SM masking only works on sm_35+
341 if (major < 3 || (major == 3 && minor < 5))
342 return ENOTSUP;
343 // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
344 // as the P100, which is uniquely sm_60
345 int sms_per_tpc;
346 if (major > 6 || (major == 6 && minor == 0))
347 sms_per_tpc = 2;
348 else
349 sms_per_tpc = 1;
350 // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
351 // with Hopper
352 if (major >= 9)
353 fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
354 " and will likely yield incorrect results! Proceed with caution.\n");
355 *num_tpcs = num_sms/sms_per_tpc;
356 return 0;
357abort_cuda:
358 cuGetErrorName(res, &err_str);
359 fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
360 return EIO;
361}
362