/**
 * Copyright 2022-2025 Joshua Bakita
 * Library to control SM masks on CUDA launches. Co-opts preexisting debug
 * logic in the CUDA driver library, and thus requires a build with -lcuda.
 *
 * This file implements partitioning via two different mechanisms:
 * - Modifying the QMD/TMD immediately prior to upload
 * - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD
 * This table shows the mechanism used with each CUDA version:
 * +-----------+---------------+---------------+--------------+
 * | Version   | Global Mask   | Stream Mask   | Next Mask    |
 * +-----------+---------------+---------------+--------------+
 * | 8.0-12.8  | TMD/QMD Hook  | stream struct | TMD/QMD Hook |
 * | 6.5-7.5   | TMD/QMD Hook  | N/A           | TMD/QMD Hook |
 * +-----------+---------------+---------------+--------------+
 * "N/A" indicates that a mask type is unsupported on that CUDA version.
 * Please contact the authors if support is needed for a particular feature on
 * an older CUDA version. Support for those is unimplemented, not impossible.
 *
 * An old implementation of this file affected the global mask on CUDA 10.2 by
 * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
 * That implementation was extraordinarily complicated, and was replaced in
 * 2024 with a more-backward-compatible way of hooking the TMD/QMD.
 * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
 */
#define _GNU_SOURCE // To enable use of memfd_create()
#include <cuda.h>
#include <dlfcn.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>

#include "libsmctrl.h"

// In functions that do not return an error code, we favor terminating with an
// error rather than merely printing a warning and continuing.
#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
                                             __VA_ARGS__)

/*** QMD/TMD-based SM Mask Control via Debug Callback. ***/

// Tested working on x86_64 CUDA 6.5, 9.1, and various 10+ versions
// (No testing attempted on pre-CUDA-6.5 versions)

// Values for the following three lines can be extracted by tracing CUPTI as
// it interacts with libcuda.so to set callbacks.
static const CUuuid callback_funcs_id = {{0x2c, (char)0x8e, 0x0a, (char)0xd8,
	0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71,
	(char)0x9f, (char)0xe5, (char)0xf7, 0x4b}};

// These callback descriptors appear to intercept the TMD/QMD late enough that
// CUDA has already applied the per-stream mask from its internal data
// structures, allowing us to override it with the next mask.
#define QMD_DOMAIN 0xb
#define QMD_PRE_UPLOAD 0x1

/**
 * These globals must be non-static (i.e., have global linkage) to ensure that
 * if multiple copies of the library are loaded (e.g., dynamically linked to
 * both this program and a dependency), secondary copies do not attempt to
 * repeat initialization or make changes to unused copies of mask values.
 */
// Supreme mask (cannot be overridden)
uint128_t *g_supreme_sm_mask = NULL;
// Global mask (applies across all threads)
uint64_t g_sm_mask = 0;
// Next mask (applies per-thread)
__thread uint64_t g_next_sm_mask = 0;
// Flag value to indicate if setup has been completed
bool sm_control_setup_called = false;
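// A minimal usage sketch of the three mask types from a client program,
// assuming an 8-TPC GPU (query the real count via libsmctrl_get_tpc_info_cuda()).
// Set bits denote *disabled* TPCs. Illustrative only; `example_masks` is not
// part of this library and the block is not compiled.
#if 0
static void example_masks(CUstream stream)
{
	// Default for this process: allow only TPCs 0-3 (disable bits 4 and up)
	libsmctrl_set_global_mask(~0xfULL);
	// Launches into `stream`: allow only TPCs 4-7 (disable bits 0-3)
	libsmctrl_set_stream_mask(stream, 0xfULL);
	// Only the next launch from this thread: allow only TPC 0. This
	// overrides the stream and global masks once, then clears itself.
	libsmctrl_set_next_mask(~0x1ULL);
}
#endif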
#ifdef LIBSMCTRL_STATIC
// Special handling for when built as a static library and the libcuda.so.1
// libsmctrl wrapper is in use (see the comment on the setup() constructor for
// details).
static void (*shared_set_global_mask)(uint64_t) = NULL;
static void (*shared_set_next_mask)(uint64_t) = NULL;
#endif

// v1 has been removed---it intercepted the TMD/QMD too early, making it
// impossible to override the CUDA-injected stream mask with the next mask.
static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in_params) {
	// ***Only tested on platforms with 64-bit pointers.***
	// The first 8-byte element in `in_params` appears to be its size.
	// `in_params` must have at least five 8-byte elements for index four to
	// be valid.
	if (*(uint32_t*)in_params < 5 * sizeof(void*))
		abort(1, 0, "Unsupported CUDA version for callback-based SM masking. Aborting...");
	// The element at index four in `in_params` is a pointer to the TMD. Note
	// that this pointer only exists when the first 8-byte element of
	// `in_params` is at least 0x28 (checked above).
	void* tmd = *((void**)in_params + 4);
	if (!tmd)
		abort(1, 0, "TMD allocation appears NULL; likely forward-compatibility issue.\n");

	uint32_t *lower_ptr, *upper_ptr, *ext_lower_ptr, *ext_upper_ptr;
	// The location of the TMD version field seems consistent across versions
	uint8_t tmd_ver = *(uint8_t*)(tmd + 72);

	if (tmd_ver >= 0x40) {
		// TMD V04_00 is used starting with Hopper to support masking >64 TPCs
		lower_ptr = tmd + 304;
		upper_ptr = tmd + 308;
		ext_lower_ptr = tmd + 312;
		ext_upper_ptr = tmd + 316;
		// XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
		// ...global_mask_ext
		*ext_lower_ptr = -1;
		*ext_upper_ptr = -1;
		// An enable bit is also required
		*(uint32_t*)tmd |= 0x80000000;
	} else if (tmd_ver >= 0x16) {
		// TMD V01_06 is used starting with Kepler V2, and is the first to
		// support TPC masking
		lower_ptr = tmd + 84;
		upper_ptr = tmd + 88;
	} else {
		// TMD V00_06 is documented to not support SM masking
		abort(1, 0, "TMD version %04o is too old! This GPU does not support SM masking.\n", tmd_ver);
	}

	// Setting the next mask overrides both per-stream and global masks
	if (g_next_sm_mask) {
		*lower_ptr = (uint32_t)g_next_sm_mask;
		*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
		g_next_sm_mask = 0;
	} else if (!*lower_ptr && !*upper_ptr) {
		// Only apply the global mask if a per-stream mask hasn't been set
		*lower_ptr = (uint32_t)g_sm_mask;
		*upper_ptr = (uint32_t)(g_sm_mask >> 32);
	}
	// No one may override the supreme SM mask; any SMs disabled in it (set
	// bits) must always remain disabled.
	if (g_supreme_sm_mask) {
		*lower_ptr |= (uint32_t)*g_supreme_sm_mask;
		*upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32);
		if (tmd_ver >= 0x40) {
			*ext_lower_ptr |= (uint32_t)(*g_supreme_sm_mask >> 64);
			*ext_upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 96);
		}
	}
	//fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
	//fprintf(stderr, "Final SM Mask (upper): %x\n", *upper_ptr);
}
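// The masks written into the TMD above use the hardware convention that a set
// bit disables the corresponding TPC. Two small helpers (a sketch only;
// `allow_first_n_tpcs` and `disable_one_tpc` are not part of this library)
// showing how such masks are typically constructed:
#if 0
// Disable every TPC except TPCs 0 through n-1
static uint64_t allow_first_n_tpcs(unsigned n)
{
	return n >= 64 ? 0 : ~((1ULL << n) - 1);
}

// Disable only TPC i, leaving all other TPCs available
static uint64_t disable_one_tpc(unsigned i)
{
	return 1ULL << i;
}
#endif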
static void setup_sm_control_callback() {
	int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
	int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
	uintptr_t* tbl_base;
	uint32_t my_hndl;
	// Avoid race conditions (setup should only run once)
	if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
		return;

#if CUDA_VERSION <= 6050
	// Verify supported CUDA version
	// It's impossible for us to run with a version of CUDA older than we were
	// built by, so this check is excluded if built with CUDA > 6.5.
	int ver = 0;
	cuDriverGetVersion(&ver);
	if (ver < 6050)
		abort(1, ENOSYS, "Global or next masking requires at least CUDA 6.5; "
		                 "this application is using CUDA %d.%d",
		                 ver / 1000, (ver % 100));
#endif

	// Set up callback
	cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
	uintptr_t subscribe_func_addr = *(tbl_base + 3);
	uintptr_t enable_func_addr = *(tbl_base + 6);
	subscribe = (typeof(subscribe))subscribe_func_addr;
	enable = (typeof(enable))enable_func_addr;
	int res = 0;
	res = subscribe(&my_hndl, control_callback_v2, NULL);
	if (res)
		abort(1, 0, "Error subscribing to launch callback. CUDA returned error code %d.", res);
	res = enable(1, my_hndl, QMD_DOMAIN, QMD_PRE_UPLOAD);
	if (res)
		abort(1, 0, "Error enabling launch callback. CUDA returned error code %d.", res);
}

// Set default mask for all launches
void libsmctrl_set_global_mask(uint64_t mask) {
#ifdef LIBSMCTRL_STATIC
	// Special handling for when built as a static library and the
	// libcuda.so.1 libsmctrl wrapper is in use (see the comment on the
	// setup() constructor for details).
	if (shared_set_global_mask)
		return (*shared_set_global_mask)(mask);
#endif
	setup_sm_control_callback();
	g_sm_mask = mask;
}

// Set mask for next launch from this thread
void libsmctrl_set_next_mask(uint64_t mask) {
#ifdef LIBSMCTRL_STATIC
	// Special handling for when built as a static library and the
	// libcuda.so.1 libsmctrl wrapper is in use (see the comment on the
	// setup() constructor for details).
	if (shared_set_next_mask)
		return (*shared_set_next_mask)(mask);
#endif
	setup_sm_control_callback();
	g_next_sm_mask = mask;
}
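// The next mask applies to exactly one launch from the calling thread and then
// clears itself (see control_callback_v2() above). A sketch of the intended
// call pattern with the CUDA driver API; `my_kernel_func` and the launch
// geometry are placeholders, and this block is not compiled:
#if 0
static void launch_on_private_tpcs(CUfunction my_kernel_func, CUstream stream)
{
	// Restrict only the following launch to TPCs 0 and 1
	libsmctrl_set_next_mask(~0x3ULL);
	cuLaunchKernel(my_kernel_func, 1, 1, 1, 64, 1, 1, 0, stream, NULL, NULL);
	// Later launches revert to the stream mask (if any) or the global mask
}
#endif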
/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/

// Offsets for the stream struct on x86_64
// No offset appears to work with CUDA 6.5 (tried 0x0--0x1b4 w/ 4-byte step)
// 6.5 tested on 340.118
#define CU_8_0_MASK_OFF 0xec
#define CU_9_0_MASK_OFF 0x130
// CUDA 9.0 and 9.1 use the same offset
// 9.1 tested on 390.157
#define CU_9_2_MASK_OFF 0x140
#define CU_10_0_MASK_OFF 0x244
// CUDA 10.0, 10.1, and 10.2 use the same offset
// 10.1 tested on 418.113
// 10.2 tested on 440.100, 440.82, 440.64, and 440.36
#define CU_11_0_MASK_OFF 0x274
#define CU_11_1_MASK_OFF 0x2c4
#define CU_11_2_MASK_OFF 0x37c
// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
// 11.4 tested on 470.223.02
#define CU_11_6_MASK_OFF 0x38c
#define CU_11_7_MASK_OFF 0x3c4
#define CU_11_8_MASK_OFF 0x47c
// 11.8 tested on 520.56.06
#define CU_12_0_MASK_OFF 0x4cc
// CUDA 12.0 and 12.1 use the same offset
// 12.0 tested on 525.147.05
#define CU_12_2_MASK_OFF 0x4e4
// 12.2 tested on 535.129.03
#define CU_12_3_MASK_OFF 0x49c
// 12.3 tested on 545.29.06
#define CU_12_4_MASK_OFF 0x4ac
// 12.4 tested on 550.54.14 and 550.54.15
#define CU_12_5_MASK_OFF 0x4ec
// CUDA 12.5 and 12.6 use the same offset
// 12.5 tested on 555.58.02
// 12.6 tested on 560.35.03
#define CU_12_7_MASK_OFF 0x4fc
// CUDA 12.7 and 12.8 use the same offset
// 12.7 tested on 565.77
// 12.8 tested on 570.124.06

// Offsets for the stream struct on Jetson aarch64
#define CU_9_0_MASK_OFF_JETSON 0x128
// 9.0 tested on Jetpack 3.x (TX2, Nov 2023)
#define CU_10_2_MASK_OFF_JETSON 0x24c
// 10.2 tested on Jetpack 4.x (AGX Xavier and TX2, Nov 2023)
#define CU_11_4_MASK_OFF_JETSON 0x394
// 11.4 tested on Jetpack 5.x (AGX Orin, Nov 2023)
// TODO: 11.8, 12.0, 12.1, and 12.2 on Jetpack 5.x via compatibility packages
#define CU_12_2_MASK_OFF_JETSON 0x50c
// 12.2 tested on Jetpack 6.x (AGX Orin, Dec 2024)
#define CU_12_4_MASK_OFF_JETSON 0x4c4
// 12.4 tested on Jetpack 6.x with cuda-compat-12-4 (AGX Orin, Dec 2024)
#define CU_12_5_MASK_OFF_JETSON 0x50c
// 12.5 tested on Jetpack 6.x with cuda-compat-12-5 (AGX Orin, Dec 2024)
#define CU_12_6_MASK_OFF_JETSON 0x514
// 12.6 tested on Jetpack 6.x with cuda-compat-12-6 (AGX Orin, Dec 2024)

// Used up through CUDA 11.8 in the stream struct
struct stream_sm_mask {
	uint32_t upper;
	uint32_t lower;
};

// Used starting with CUDA 12.0 in the stream struct
struct stream_sm_mask_v2 {
	uint32_t enabled;
	uint32_t mask[4];
};

// Check if this system has a Parker SoC (TX2/PX2 chip)
// (CUDA 9.0 behaves slightly differently on this platform.)
// @return 1 if detected, 0 if not, -cuda_err on error
#if __aarch64__
static int detect_parker_soc() {
	int cap_major, cap_minor, err, dev_count;
	if (err = cuDeviceGetCount(&dev_count))
		return -err;
	// As CUDA devices are numbered by order of compute power, check every
	// device, in case a powerful discrete GPU is attached (such as on the
	// DRIVE PX2). We detect the Parker SoC via its unique CUDA compute
	// capability: 6.2.
	for (int i = 0; i < dev_count; i++) {
		if (err = cuDeviceGetAttribute(&cap_minor,
		                               CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
		                               i))
			return -err;
		if (err = cuDeviceGetAttribute(&cap_major,
		                               CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
		                               i))
			return -err;
		if (cap_major == 6 && cap_minor == 2)
			return 1;
	}
	return 0;
}
#endif // __aarch64__

// Should work for CUDA 8.0 through 12.8
// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
// our header
void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
	// When the old API is used on GPUs with over 64 TPCs, disable all TPCs
	// beyond the first 64
	uint128_t full_mask = -1;
	full_mask <<= 64;
	full_mask |= mask;
	libsmctrl_set_stream_mask_ext(stream, full_mask);
}
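// On GPUs with more than 64 TPCs, masks for the _ext API below are built the
// same way, just in 128-bit arithmetic. A sketch (not compiled; the helper
// name is a placeholder) that leaves only TPCs 64-79 enabled:
#if 0
static uint128_t upper_bank_only_mask(void)
{
	uint128_t enabled = 0xffff; // TPCs 64-79, before shifting into place
	enabled <<= 64;
	return ~enabled;            // set bit = disabled, so invert
}
// ...then: libsmctrl_set_stream_mask_ext(stream, upper_bank_only_mask());
#endif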
void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
	char* stream_struct_base = *(char**)stream;
	struct stream_sm_mask* hw_mask = NULL;
	struct stream_sm_mask_v2* hw_mask_v2 = NULL;
	int ver;
	cuDriverGetVersion(&ver);
	switch (ver) {
#if __x86_64__
	case 8000:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
		break;
	case 9000:
	case 9010: {
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
		break;
	}
	case 9020:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF);
		break;
	case 10000:
	case 10010:
	case 10020:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_0_MASK_OFF);
		break;
	case 11000:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_0_MASK_OFF);
		break;
	case 11010:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_1_MASK_OFF);
		break;
	case 11020:
	case 11030:
	case 11040:
	case 11050:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_2_MASK_OFF);
		break;
	case 11060:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_6_MASK_OFF);
		break;
	case 11070:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_7_MASK_OFF);
		break;
	case 11080:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF);
		break;
	case 12000:
	case 12010:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF);
		break;
	case 12020:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
		break;
	case 12030:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF);
		break;
	case 12040:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF);
		break;
	case 12050:
	case 12060:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF);
		break;
	case 12070:
	case 12080:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_7_MASK_OFF);
		break;
#elif __aarch64__
	case 9000: {
		// The Jetson TX2 offset is slightly different on CUDA 9.0.
		// Only compile the check into ARM64 builds.
		// TODO: Always verify Jetson-board-only on aarch64.
		int is_parker;
		const char* err_str;
		if ((is_parker = detect_parker_soc()) < 0) {
			cuGetErrorName(-is_parker, &err_str);
			abort(1, 0, "While performing platform-specific "
			            "compatibility checks for stream masking, "
			            "CUDA call failed with error '%s'.", err_str);
		}
		if (!is_parker)
			abort(1, 0, "Not supported on non-Jetson aarch64.");
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON);
		break;
	}
	case 10020:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON);
		break;
	case 11040:
		hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
		break;
	case 12020:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF_JETSON);
		break;
	case 12040:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF_JETSON);
		break;
	case 12050:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF_JETSON);
		break;
	case 12060:
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON);
		break;
#endif
	}

	// For experimenting to determine the right mask offset, set the MASK_OFF
	// environment variable (positive and negative numbers are supported)
	char* mask_off_str = getenv("MASK_OFF");
	if (mask_off_str) {
		int off = atoi(mask_off_str);
		fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x "
		                "(total off: %#x)\n", off, CU_12_2_MASK_OFF,
		                CU_12_2_MASK_OFF + off);
		if (CU_12_2_MASK_OFF + off < 0)
			abort(1, 0, "Total offset cannot be less than 0! Aborting...");
		// +4 bytes to convert a mask found with this for use with hw_mask
		hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off);
	}

	// Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs
	if (hw_mask) {
		hw_mask->upper = mask >> 32;
		hw_mask->lower = mask;
	} else if (hw_mask_v2) {
		hw_mask_v2->enabled = 1;
		hw_mask_v2->mask[0] = mask;
		hw_mask_v2->mask[1] = mask >> 32;
		hw_mask_v2->mask[2] = mask >> 64;
		hw_mask_v2->mask[3] = mask >> 96;
	} else {
		abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and"
		            " no fallback MASK_OFF set!", ver);
	}
}

/*** TPC and GPU Informational Functions ***/

// Read an integer from a file in `/proc`
static int read_int_procfile(char* filename, uint64_t* out) {
	char f_data[18] = {0};
	ssize_t ret;
	int fd = open(filename, O_RDONLY);
	if (fd == -1)
		return errno;
	ret = read(fd, f_data, 18);
	if (ret == -1)
		return errno;
	close(fd);
	*out = strtoll(f_data, NULL, 16);
	return 0;
}

// We support up to 128 TPCs, up to 12 GPCs per GPU, and up to 16 GPUs.
#define MAX_GPCS 12
static uint64_t tpc_mask_per_gpc_per_dev[16][MAX_GPCS];
static uint128_t tpc_mask_per_gpc_per_dev_ext[16][MAX_GPCS];

// Output mask is vtpc-indexed (virtual TPC)
// Note that this function has to undo _both_ floorsweeping and ID remapping
int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
	int err, i;
	uint128_t *tpcs_for_gpc_ext;
	if ((err = libsmctrl_get_gpc_info_ext(num_enabled_gpcs, &tpcs_for_gpc_ext, dev)))
		return err;
	for (i = 0; i < *num_enabled_gpcs; i++) {
		if ((tpcs_for_gpc_ext[i] & -1ull) != tpcs_for_gpc_ext[i])
			return ERANGE;
		tpc_mask_per_gpc_per_dev[dev][i] = (uint64_t)tpcs_for_gpc_ext[i];
	}
	*tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
	return 0;
}
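// Worked example of the CWD_GPC_TPC_ID decoding performed below (the format is
// inferred from this code, not from a hardware manual): if
// /proc/gpu0/CWD_GPC_TPC_ID0 reads 0x10100000, the upper nibble of each byte
// gives the GPC for TPCs 0-3, i.e.
//   TPC 0 -> GPC 0, TPC 1 -> GPC 0, TPC 2 -> GPC 1, TPC 3 -> GPC 1,
// so bits 0-1 end up set in tpcs_for_gpc[0] and bits 2-3 in tpcs_for_gpc[1].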
int libsmctrl_get_gpc_info_ext(uint32_t* num_enabled_gpcs, uint128_t** tpcs_for_gpc, int dev) {
	uint32_t i, j, tpc_id, gpc_id, num_enabled_tpcs, num_configured_tpcs;
	uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask, gpc_tpc_config,
	         total_read = 0;
	uint128_t tpc_bit;
	int err;
	char filename[100];
	*num_enabled_gpcs = 0;
	// Maximum number of GPCs supported for this chip
	snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
	if (err = read_int_procfile(filename, &max_gpcs)) {
		fprintf(stderr, "libsmctrl: nvdebug module must be loaded into the kernel "
		                "before using the libsmctrl_get_*_info() functions\n");
		return err;
	}
	// TODO: handle arbitrary-size GPUs
	if (dev >= 16 || max_gpcs > MAX_GPCS) {
		fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
		return ERANGE;
	}
	// Set bit = disabled GPC
	snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
	if (err = read_int_procfile(filename, &gpc_mask))
		return err;
	// Determine the number of enabled TPCs
	snprintf(filename, 100, "/proc/gpu%d/num_tpc_per_gpc", dev);
	if (err = read_int_procfile(filename, &num_tpc_per_gpc))
		return err;
	// For each enabled GPC
	num_enabled_tpcs = 0;
	for (i = 0; i < max_gpcs; i++) {
		// Skip this GPC if disabled
		if ((1 << i) & gpc_mask)
			continue;
		(*num_enabled_gpcs)++;
		// Get the bitstring of TPCs disabled for this physical GPC
		// Set bit = disabled TPC
		snprintf(filename, 100, "/proc/gpu%d/gpc%d_tpc_mask", dev, i);
		if (err = read_int_procfile(filename, &gpc_tpc_mask))
			return err;
		// Bits greater than the max number of TPCs should be ignored, so only
		// keep the lower `num_tpc_per_gpc` bits.
		gpc_tpc_mask &= -1ull >> (64 - num_tpc_per_gpc);
		// Number of enabled TPCs = max - number disabled
		num_enabled_tpcs += num_tpc_per_gpc - __builtin_popcountl(gpc_tpc_mask);
	}
	// Clear any previous mask
	for (i = 0; i < MAX_GPCS; i++)
		tpc_mask_per_gpc_per_dev_ext[dev][i] = 0;
	// For each enabled TPC
	for (tpc_id = 0; tpc_id < num_enabled_tpcs;) {
		// Pull mapping for the next set of 4 TPCs
		snprintf(filename, 100, "/proc/gpu%d/CWD_GPC_TPC_ID%d", dev, tpc_id / 4);
		if (err = read_int_procfile(filename, &gpc_tpc_config))
			return err;
		total_read += gpc_tpc_config;
		for (j = 0; j < 4 && tpc_id < num_enabled_tpcs; j++, tpc_id++) {
			// Set the bit for the current TPC
			tpc_bit = 1;
			tpc_bit <<= tpc_id;
			// Determine which GPC the current TPC is associated with
			// (upper 4 bits of each byte)
			gpc_id = (gpc_tpc_config >> (j*8 + 4) & 0xfu);
			// Save mapping
			tpc_mask_per_gpc_per_dev_ext[dev][gpc_id] |= tpc_bit;
		}
	}
	// Verify each TPC is configured
	tpc_bit = 0;
	for (i = 0; i < MAX_GPCS; i++)
		tpc_bit |= tpc_mask_per_gpc_per_dev_ext[dev][i];
	num_configured_tpcs = __builtin_popcountl(tpc_bit) +
	                      __builtin_popcountl(tpc_bit >> 64);
	if (num_configured_tpcs != num_enabled_tpcs) {
		fprintf(stderr, "libsmctrl: Found configuration for only %d TPCs when %d "
		                "were expected.\n", num_configured_tpcs, num_enabled_tpcs);
		return EIO;
	}
	// Verify that the configuration was not always zero (indicates a
	// powered-off GPU).
	if (total_read == 0) {
		fprintf(stderr, "libsmctrl: Is GPU on? Configuration registers are all zero.\n");
		return EIO;
	}
	*tpcs_for_gpc = tpc_mask_per_gpc_per_dev_ext[dev];
	return 0;
}

int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
	uint32_t num_gpcs;
	uint128_t* tpcs_per_gpc;
	int res, gpc;
	if (res = libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_per_gpc, dev))
		return res;
	*num_tpcs = 0;
	for (gpc = 0; gpc < num_gpcs; gpc++) {
		*num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
		*num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc] >> 64);
	}
	return 0;
}

// @param cuda_dev Device index as understood by CUDA **can differ from nvdebug idx**
// This implementation is fragile, and could be incorrect for odd GPUs
int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
	int num_sms, sms_per_tpc, major, minor, res = 0;
	const char* err_str;
	if (res = cuInit(0))
		goto abort_cuda;
	if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
		goto abort_cuda;
	if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
		goto abort_cuda;
	if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
		goto abort_cuda;
	// SM masking only works on sm_35+
	if (major < 3 || (major == 3 && minor < 5))
		return ENOTSUP;
	// Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as does
	// the P100, which is uniquely sm_60
	if (major > 6 || (major == 6 && minor == 0))
		sms_per_tpc = 2;
	else
		sms_per_tpc = 1;
	// It looks like there may be some upcoming weirdness (TPCs with only one
	// SM?) with Hopper
	if (major >= 9)
		fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
		                " and will likely yield incorrect results! Proceed with caution.\n");
	*num_tpcs = num_sms / sms_per_tpc;
	return 0;
abort_cuda:
	cuGetErrorName(res, &err_str);
	fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
	return EIO;
}
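// Sketch of combining the informational and masking APIs: restrict a stream to
// the TPCs of GPC 0 only. Requires the nvdebug kernel module; the function
// name is a placeholder and this block is not compiled.
#if 0
static void pin_stream_to_gpc0(void* stream, int nvdebug_dev)
{
	uint32_t num_gpcs;
	uint64_t *tpcs_for_gpc;
	if (libsmctrl_get_gpc_info(&num_gpcs, &tpcs_for_gpc, nvdebug_dev))
		return; // nvdebug not loaded, or GPU too large/unsupported
	// tpcs_for_gpc[0] has a set bit for each TPC in GPC 0; disable the rest
	libsmctrl_set_stream_mask(stream, ~tpcs_for_gpc[0]);
}
#endif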
/*** Private functions for nvtaskset and building as a libcuda.so.1 wrapper ***/

// Check if NVIDIA MPS is running, following the process that `strace` shows
// `nvidia-cuda-mps-control` to use. MPS is a prerequisite to co-running
// multiple GPU-using tasks without timeslicing.
bool libsmctrl_is_mps_running() {
	char *mps_pipe_dir;
	int mps_ctrl;
	struct sockaddr_un mps_ctrl_addr;
	mps_ctrl_addr.sun_family = AF_UNIX;
	const int yes = 1;
	if (!(mps_pipe_dir = getenv("CUDA_MPS_PIPE_DIRECTORY")))
		mps_pipe_dir = "/tmp/nvidia-mps";
	// Pipe names are limited to 108 characters
	snprintf(mps_ctrl_addr.sun_path, 108, "%s/control", mps_pipe_dir);
	// This mirrors the process `nvidia-cuda-mps-control` uses to detect MPS
	if ((mps_ctrl = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
		return false;
	if (setsockopt(mps_ctrl, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(yes)) == -1)
		return false;
	if (connect(mps_ctrl, (struct sockaddr*)&mps_ctrl_addr, sizeof(struct sockaddr_un)) == -1)
		return false;
	close(mps_ctrl);
	return true;
}

// A variant of strtoul() with support for 128-bit integers
uint128_t strtou128(const char *nptr, char **endptr, int base) {
	unsigned __int128 result = 0;
	if (base != 16)
		error(1, EINVAL, "strtou128 only supports base 16");
	// Skip a "0x" prefix. Safe due to short-circuit evaluation
	if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X'))
		nptr += 2;
	// Until hitting an invalid character
	while (1) {
		if (*nptr >= 'a' && *nptr <= 'f')
			result = result << 4 | (*nptr - 'a' + 10);
		else if (*nptr >= 'A' && *nptr <= 'F')
			result = result << 4 | (*nptr - 'A' + 10);
		else if (*nptr >= '0' && *nptr <= '9')
			result = result << 4 | (*nptr - '0');
		else
			break;
		nptr++;
	}
	if (endptr)
		*endptr = (char*)nptr;
	return result;
}
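// strtou128() exists because the masks above can exceed 64 bits. A sketch of
// how setup() below interprets LIBSMCTRL_MASK values parsed with it (the "~"
// prefix turns a disable mask into an enable mask); not compiled:
#if 0
	// LIBSMCTRL_MASK=0x3  -> disable TPCs 0 and 1, leave the rest available
	uint128_t disable_mask = strtou128("0x3", NULL, 16);
	// LIBSMCTRL_MASK=~0x3 -> setup() parses "0x3" then inverts it, so every
	// TPC except 0 and 1 is disabled (only TPCs 0 and 1 may be used)
	uint128_t only_tpcs_0_and_1 = ~strtou128("0x3", NULL, 16);
#endif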
#ifdef LIBSMCTRL_WRAPPER
// The CUDA runtime library uses dlopen() to load CUDA functions from
// libcuda.so.1. Since we replace that with our wrapper library, we need to
// also redirect any attempted opens of that shared object to the actual
// shared library, which is linked to by libcuda.so.
void *dlopen(const char *filename, int flags) {
	if (filename && strcmp(filename, "libcuda.so.1") == 0) {
		fprintf(stderr, "redirecting dlopen of %s to libcuda.so\n", filename);
		// A GNU-only dlopen variant
		return dlmopen(LM_ID_BASE, "libcuda.so", flags);
	} else
		return dlmopen(LM_ID_BASE, filename, flags);
}

// Allow setting a default mask via an environment variable
// Also enables libsmctrl to be used on unmodified programs via setting:
// LD_LIBRARY_PATH=libsmctrl LIBSMCTRL_MASK=<mask> ./my_program
// Where "<mask>" is replaced with a disable mask, optionally prefixed
// with a ~ to invert it (make it an enable mask).
__attribute__((constructor)) static void setup(void) {
	char *end, *mask_str;
	// If dynamic changes are disabled (due to an error) this variable is
	// permanently used to store the supreme mask, rather than the shared
	// memory segment.
	static uint128_t mask;
	bool invert = false;
	int fd;
	mask_str = getenv("LIBSMCTRL_MASK");
	// Assume no mask if unspecified
	if (!mask_str)
		mask_str = "0";
	if (*mask_str == '~') {
		invert = true;
		mask_str++;
	}
	mask = strtou128(mask_str, &end, 16);
	// Verify we were able to parse the whole string
	if (*end != '\0')
		abort(1, EINVAL, "Unable to apply default mask");
	if (invert)
		mask = ~mask;
	// Explicitly set the number of channels (if unset), otherwise CUDA will
	// only use two with MPS (see paper for why that causes problems)
	if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 0) == -1)
		abort(1, EINVAL, "Unable to configure environment");
	// Warn if a mask was specified but MPS isn't running
	if (mask && !libsmctrl_is_mps_running())
		fprintf(stderr, "libsmctrl-libcuda-wrapper: Warning: TPC mask set via "
		                "LIBSMCTRL_MASK, but NVIDIA MPS is not running. "
		                "CUDA programs will not co-run!\n");
	// Initialize CUDA and the interception callback
	setup_sm_control_callback();
	// Create a shared memory region for the supreme mask such that nvtaskset
	// can read and modify it
	fd = memfd_create("libsmctrl", MFD_CLOEXEC);
	if (fd == -1) {
		abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
		g_supreme_sm_mask = &mask;
		return;
	}
	if (ftruncate(fd, 16) == -1) {
		abort(0, errno, "Unable to resize shared memory for dynamic partition changes. Dynamic changes disabled");
		g_supreme_sm_mask = &mask;
		return;
	}
	if ((g_supreme_sm_mask = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) {
		abort(0, errno, "Unable to map shared memory for dynamic partition changes. Dynamic changes disabled");
		g_supreme_sm_mask = &mask;
		return;
	}
	// Set the super-global mask, which cannot be overwritten by any libsmctrl
	// API function.
	*g_supreme_sm_mask = mask;
}
#elif defined(LIBSMCTRL_STATIC)
// If this library is statically built into a program, and the libcuda.so.1
// wrapper is enabled, we force the statically linked version of the library
// to defer to the function implementations in the wrapper.
//
// Longer explanation:
// If the library has been dynamically linked into a program and the wrapper
// is in use, the loader will point both to the same set of symbols (both
// resolve the symbols dynamically at load time; the global state at the top
// of this file has global linkage and thus appears in the dynamic symbol
// table, so each lookup finds the same copy).
// Symbols from a statically linked library are not included in the dynamic
// symbol table, and thus can exist in duplicate of those in any shared
// library. This is a problem, since only one callback function, using one set
// of global variables, can be registered with CUDA. We work around this by
// having our statically linked library use the functions from the wrapper, or
// from any shared copy of the library, if one is loaded.
__attribute__((constructor)) static void setup(void) {
	// dlsym() can only view the dynamic symbol tables, so these lookups will
	// fail if neither the wrapper (libcuda.so.1) nor libsmctrl.so is loaded.
	// (That indicates that we should use the static library implementations.)
	// These are a no-op on failure, since dlsym() returns NULL when a symbol
	// is not found.
	shared_set_next_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_next_mask");
	shared_set_global_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_global_mask");
}
#endif