From d052c2df34ab41ba285f70965663e5a0832f6ac9 Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Thu, 19 Dec 2024 14:20:38 -0500 Subject: Bugfix stream-mask override, support old CUDA, and start Hopper support Use a different callback to intercept the TMD/QMD later in the launch pipeline. Major improvements: - Fix bug with next mask not overriding stream mask on CUDA 11.0+ - Add CUDA 6.5-10.2 support for next- and global-granularity partitioning masks on x86_64 and aarch64 Jetson - Remove libdl dependency - Partially support TMD/QMD Version 4 (Hopper) Minor improvements: - Check for sufficient CUDA version before attempting to apply a next-granularity partitioning mask - Only check for sufficient CUDA version on the first call to `libsmctrl_set_next_mask()` or `libsmctrl_set_global_mask()`, rather than checking every time (lowers overheads) - Check that TMD version is sufficient before modifying it - Improve documentation Issues: - Partitioning mask bits have a different meaning in TMD/QMD Version 4 and require floorsweeping and remapping information to properly construct. This information will be forthcoming in future releases of libsmctrl and nvdebug. 
--- Makefile | 2 +- README.md | 6 +- libsmctrl.c | 247 +++++++++++++++++++++--------------------------------------- libsmctrl.h | 6 +- 4 files changed, 92 insertions(+), 169 deletions(-) diff --git a/Makefile b/Makefile index 0e9ee3a..0d9b9f6 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CXX = g++ NVCC ?= nvcc # -fPIC is needed in all cases, as we may be linked into another shared library CFLAGS = -fPIC -LDFLAGS = -lcuda -I/usr/local/cuda/include -ldl +LDFLAGS = -lcuda -I/usr/local/cuda/include .PHONY: clean tests diff --git a/README.md b/README.md index ce32b19..c3f87c4 100644 --- a/README.md +++ b/README.md @@ -93,18 +93,16 @@ make tests #### Known Working - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs -- CUDA 8.0 through 12.6 +- CUDA 6.5 through 12.6 - `x86_64` and Jetson `aarch64` platforms #### Known Issues -- `next_mask` will not override `stream_mask` on CUDA 11.0+ - - _As of Feb 2024, a fix for this is coming soon..._ - `global_mask` and `next_mask` cannot disable TPCs with IDs above 128 - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada -- Untested on H100 (compute capability 9.0) - Untested on non-Jetson `aarch64` platforms - Untested on CUDA 11.8, 12.0, and 12.1 on Jetson `aarch64` +- Mask bit indexes do not directly correlate to software-visible TPC/SM IDs in V4 TMD/QMDs (Hopper+; compute capability 9.0). The mask bit indexes instead appear to correspond to on-chip-units, including disabled ones; i.e. 
the set of pre-SM-ID-remapping and pre-floorsweeping TPCs ## Important Limitations diff --git a/libsmctrl.c b/libsmctrl.c index 1018e44..24a3177 100644 --- a/libsmctrl.c +++ b/libsmctrl.c @@ -5,31 +5,34 @@ * * This file implements partitioning via three different mechanisms: * - Modifying the QMD/TMD immediately prior to upload - * - Changing a field in CUDA's global struct that CUDA applies to the QMD/TMD * - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD * This table shows the mechanism used with each CUDA version: * +-----------+---------------+---------------+--------------+ * | Version | Global Mask | Stream Mask | Next Mask | * +-----------+---------------+---------------+--------------+ - * | 11.0-12.6 | TMD/QMD Hook | stream struct | TMD/QMD Hook | - * | 10.2 | global struct | stream struct | N/A | - * | 8.0-10.1 | N/A | stream struct | N/A | + * | 8.0-12.6 | TMD/QMD Hook | stream struct | TMD/QMD Hook | + * | 6.5-7.5 | TMD/QMD Hook | N/A | TMD/QMD Hook | * +-----------+---------------+---------------+--------------+ * "N/A" indicates that a mask type is unsupported on that CUDA version. * Please contact the authors if support is needed for a particular feature on * an older CUDA version. Support for those is unimplemented, not impossible. + * + * An old implementation of this file effected the global mask on CUDA 10.2 by + * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD. + * That implementation was extraordinarily complicated, and was replaced in + * 2024 with a more-backward-compatible way of hooking the TMD/QMD. + * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`. */ #include #include #include #include +#include #include #include #include -#include - #include "libsmctrl.h" // In functions that do not return an error code, we favor terminating with an @@ -37,148 +40,66 @@ #define abort(ret, errno, ...) 
error_at_line(ret, errno, __FILE__, __LINE__, \ __VA_ARGS__) -// Layout of mask control fields to match CUDA's static global struct -struct global_sm_control { - uint32_t enabled; - uint64_t mask; -} __attribute__((packed)); - -/*** CUDA Globals Manipulation. CUDA 10.2 only ***/ - -// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson) -static struct global_sm_control* g_sm_control = NULL; - -/* Find the location of CUDA's `globals` struct and the SM mask control fields - * No symbols are exported from within `globals`, so this has to do a very - * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. - * Don't call this before the CUDA library has been initialized. - * (Note that this appears to work, even if built on CUDA > 10.2.) - */ -static void setup_g_sm_control_10() { - if (g_sm_control) - return; - // The location of the static global struct containing the global SM - // mask field will vary depending on where the loader locates the CUDA - // library. In order to reliably modify this struct, we must defeat - // that relocation by deriving its location relative to a known - // reference point. - // - // == Choosing a Reference Point: - // The cudbg* symbols appear to be relocated to a constant offset from - // the globals structure, and so we use the address of the symbol - // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends - // up being the closest to an intermediate table we use as part of our - // lookup---process discussed below.) - // - // Unfortunately, the symbol we reference is errantly omitted from the - // libcuda.so stub used by nvcc starting around CUDA 11.8, so we have to - // use dlsym to avoid build-time issues. - void* hndl = dlopen(NULL, RTLD_LAZY); - uint32_t* sym = dlsym(hndl, "cudbgReportDriverApiErrorFlags"); - - // == Deriving Location: - // The number of CUDA devices available is co-located in the same CUDA - // globals structure that we aim to modify the SM mask field in. 
The - // value in that field can be assigned to a user-controlled pointer via - // the cuDeviceGetCount() CUDA Driver Library function. To determine - // the location of thu structure, we pass a bad address to the function - // and dissasemble the code adjacent to where it segfaults. On the - // Jetson Xavier with CUDA 10.2, the assembly is as follows: - // (reg x19 contains cuDeviceGetCount()'s user-provided pointer) - // ... - // 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero - // 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl - // 0x0000007fb71454bc: ldr x1, [x1,#3672] // Get addr of globals - // 0x0000007fb71454c0: ldr w1, [x1,#904] // Get count from globals - // 0x0000007fb71454c4: str w1, [x19] // Store count at user addr - // ... - // In this assembly, we can identify that CUDA uses an internal lookup - // table to identify the location of the globals structure (pointer - // 459 in the table; offset 3672). After obtaining this pointer, it - // advances to offset 904 in the global structure, dereferences the - // value stored there, and then attempts to store it at the user- - // -provided address (register x19). This final line will trigger a - // segfault if a non-zero bad address is passed to cuDeviceGetCount(). - // - // On x86_64: - // (reg %rbx contains cuDeviceGetCount()'s user-provided pointer) - // ... - // 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero - // 0x00007ffff6cac022: je 0x7ffff6cac038 // '' - // 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer - // 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference - // 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr - // ... - // Note that this does not use an intermediate lookup table. - // - // [Aside: cudbgReportDriverApiErrorFlags is currently the closest - // symbol to **the lookup table**. 
cudbgDebuggerInitialized is closer - // to the globals struct itself (+7424 == SM mask control), but we - // perfer the table lookup approach for now, as that's what - // cuDeviceGetCount() does.] - -#if __aarch64__ - // In my test binary, the lookup table is at address 0x7fb7ea6000, and - // this is 1029868 bytes before the address for - // cudbgReportDriverApiErrorFlags. Use this information to derive the - // location of the lookup in our binary (defeat relocation). - uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); - // Address of `globals` is at offset 3672 (entry 459?) in the table - uintptr_t globals_addr = *(tbl_base + 459); - // SM mask control is at offset 4888 in the `globals` struct - // [Device count at offset 904 (0x388)] - g_sm_control = (struct global_sm_control*)(globals_addr + 4888); -#endif // __aarch64__ -#if __x86_64__ - // In my test binary, globals is at 0x7ffff7cb0548, which is 1103576 - // bytes before the address for cudbgReportDriverApiErrorFlags - // (0x7ffff7dbdc20). Use this offset to defeat relocation. - uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576); - // SM mask control is at offset 4728 in the `globals` struct - // [Device count at offset 776 (0x308)] - g_sm_control = (struct global_sm_control*)(globals_addr + 4728); -#endif // __x86_64__ - // SM mask should be empty by default - if (g_sm_control->enabled || g_sm_control->mask) - fprintf(stderr, "Warning: Found non-empty SM disable mask " - "during setup! libsmctrl_set_global_mask() is " - "unlikely to work on this platform!\n"); -} - -/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ +/*** QMD/TMD-based SM Mask Control via Debug Callback. ***/ -// Tested working on CUDA x86_64 11.0-12.2. 
-// Tested not working on aarch64 or x86_64 10.2 +// Tested working on x86_64 CUDA 6.5, 9.1, and various 10+ versions +// (No testing attempted on pre-CUDA-6.5 versions) +// Values for the following three lines can be extracted by tracing CUPTI as +// it interacts with libcuda.so to set callbacks. static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; -#define LAUNCH_DOMAIN 0x3 -#define LAUNCH_PRE_UPLOAD 0x3 +// These callback descriptors appear to intercept the TMD/QMD late enough that +// CUDA has already applied the per-stream mask from its internal data +// structures, allowing us to override it with the next mask. +#define QMD_DOMAIN 0xb +#define QMD_PRE_UPLOAD 0x1 +// Global mask (applies across all threads) static uint64_t g_sm_mask = 0; +// Next mask (applies per-thread) static __thread uint64_t g_next_sm_mask = 0; -static char sm_control_setup_called = 0; -static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) { - // The third 8-byte element in `in_parms` is a pointer to the stream struct. - // This exists even when in_params < 0x50. This could be used to implement - // stream masking without the manual offsets specified elsewhere (store a - // table of stream pointers to masks and do a lookup here). - // It could also be used (although not as easily) to support global and next - // masking on old CUDA versions, but that would require hooking earlier in the - // launch process (before the stream mask is applied). - if (*(uint32_t*)in_params < 0x50) +// Flag value to indicate if setup has been completed +static bool sm_control_setup_called = false; + +// v1 has been removed---it intercepted the TMD/QMD too early, making it +// impossible to override the CUDA-injected stream mask with the next mask. 
+static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in_params) { + // ***Only tested on platforms with 64-bit pointers.*** + // The first 8-byte element in `in_params` appears to be its size. `in_params` + // must have at least five 8-byte elements for index four to be valid. + if (*(uint32_t*)in_params < 5 * sizeof(void*)) abort(1, 0, "Unsupported CUDA version for callback-based SM masking. Aborting..."); - // The eighth 8-byte element in `in_params` is a pointer to a struct which - // contains a pointer to the TMD as its first element. Note that this eighth - // pointer must exist---it only exists when the first 8-byte element of - // `in_params` is at least 0x50 (checked above). - void* tmd = **((uintptr_t***)in_params + 8); + // The 8-byte element at index four of `in_params` is a pointer to the TMD. + // Note that this pointer must exist---it only exists when the first + // 8-byte element of `in_params` is at least 0x28 (checked above). + void* tmd = *((void**)in_params + 4); if (!tmd) abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n"); - //fprintf(stderr, "cta: %lx\n", *(uint64_t*)(tmd + 74)); - // TODO: Check for supported QMD version (>XXX, <4.00) - // TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 16 bytes (rather than 8 bytes) wide. It also requires an enable bit at +31bits. 
- uint32_t *lower_ptr = tmd + 84; - uint32_t *upper_ptr = tmd + 88; + uint32_t *lower_ptr, *upper_ptr; + + // The location of the TMD version field seems consistent across versions + uint8_t tmd_ver = *(uint8_t*)(tmd + 72); + + if (tmd_ver >= 0x40) { + // TMD V04_00 is used starting with Hopper to support masking >64 TPCs + lower_ptr = tmd + 304; + upper_ptr = tmd + 308; + // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and + // ...global_mask_ext + *(uint32_t*)(tmd + 312) = -1; + *(uint32_t*)(tmd + 316) = -1; + // An enable bit is also required + *(uint32_t*)tmd |= 0x80000000; + } else if (tmd_ver >= 0x16) { + // TMD V01_06 is used starting with Kepler V2, and is the first to + // support TPC masking + lower_ptr = tmd + 84; + upper_ptr = tmd + 88; + } else { + // TMD V00_06 is documented to not support SM masking + abort(1, 0, "TMD version %04o is too old! This GPU does not support SM masking.\n", tmd_ver); + } + // Setting the next mask overrides both per-stream and global masks if (g_next_sm_mask) { *lower_ptr = (uint32_t)g_next_sm_mask; *upper_ptr = (uint32_t)(g_next_sm_mask >> 32); @@ -188,11 +109,12 @@ static void launchCallback(void *ukwn, int domain, int cbid, const void *in_para *lower_ptr = (uint32_t)g_sm_mask; *upper_ptr = (uint32_t)(g_sm_mask >> 32); } - //fprintf(stderr, "lower mask: %x\n", *lower_ptr); - //fprintf(stderr, "upper mask: %x\n", *upper_ptr); + + //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr); + //fprintf(stderr, "Final SM Mask (upper): %x\n", *upper_ptr); } -static void setup_sm_control_11() { +static void setup_sm_control_callback() { int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn); int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid); uintptr_t* tbl_base; @@ -207,38 +129,41 @@ static void setup_sm_control_11() { subscribe = (typeof(subscribe))subscribe_func_addr; enable = (typeof(enable))enable_func_addr; int res = 0; - res = subscribe(&my_hndl, 
launchCallback, NULL); + res = subscribe(&my_hndl, control_callback_v2, NULL); if (res) abort(1, 0, "Error subscribing to launch callback. CUDA returned error code %d.", res); - res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD); + res = enable(1, my_hndl, QMD_DOMAIN, QMD_PRE_UPLOAD); if (res) abort(1, 0, "Error enabling launch callback. CUDA returned error code %d.", res); } // Set default mask for all launches void libsmctrl_set_global_mask(uint64_t mask) { - int ver; - cuDriverGetVersion(&ver); - if (ver == 10020) { - if (!g_sm_control) - setup_g_sm_control_10(); - g_sm_control->mask = mask; - g_sm_control->enabled = 1; - } else if (ver > 10020) { - if (!sm_control_setup_called) - setup_sm_control_11(); - g_sm_mask = mask; - } else { // < CUDA 10.2 - abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; " - "this application is using CUDA %d.%d", - ver / 1000, (ver % 100)); + if (!sm_control_setup_called) { + // The version will not change while running, so only check once + int ver = 0; + cuDriverGetVersion(&ver); + if (ver < 6050) { + abort(1, ENOSYS, "Global masking requires at least CUDA 6.5; " + "this application is using CUDA %d.%d", + ver / 1000, (ver % 100)); + setup_sm_control_callback(); } + g_sm_mask = mask; } // Set mask for next launch from this thread void libsmctrl_set_next_mask(uint64_t mask) { - if (!sm_control_setup_called) - setup_sm_control_11(); + if (!sm_control_setup_called) { + // The version will not change while running, so only check once + int ver = 0; + cuDriverGetVersion(&ver); + if (ver < 6050) { + abort(1, ENOSYS, "Next masking requires at least CUDA 6.5; " + "this application is using CUDA %d.%d", + ver / 1000, (ver % 100)); + setup_sm_control_callback(); + } g_next_sm_mask = mask; } diff --git a/libsmctrl.h b/libsmctrl.h index 6285de6..b85c0c7 100644 --- a/libsmctrl.h +++ b/libsmctrl.h @@ -1,5 +1,5 @@ /** - * Copyright 2024 Joshua Bakita + * Copyright 2022-2024 Joshua Bakita * Library to control TPC masks on CUDA 
launches. Co-opts preexisting debug * logic in the CUDA driver library, and thus requires a build with -lcuda. */ @@ -15,7 +15,7 @@ typedef unsigned __int128 uint128_t; // Set global default TPC mask for all kernels, incl. CUDA-internal ones // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) -// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1 +// Supported: CUDA 6.5 - CUDA 12.6 extern void libsmctrl_set_global_mask(uint64_t mask); // Set default TPC mask for all kernels launched via `stream` // (overrides global mask) @@ -27,7 +27,7 @@ extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); // Set TPC mask for the next kernel launch from the caller's CPU thread // (overrides global and per-stream masks, applies only to next launch). // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) -// Supported: CUDA 11.0 - CUDA 12.1 +// Supported: CUDA 6.5 - CUDA 12.6 extern void libsmctrl_set_next_mask(uint64_t mask); /** -- cgit v1.2.2