| author    | Joshua Bakita <jbakita@cs.unc.edu>               | 2024-12-19 14:20:38 -0500 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Joshua Bakita <jbakita@cs.unc.edu>               | 2024-12-19 14:48:21 -0500 |
| commit    | d052c2df34ab41ba285f70965663e5a0832f6ac9 (patch) |                           |
| tree      | 0a761be3f62910275da8a2cad546a8902073b1e9         |                           |
| parent    | aa63a02efa5fc8701f0c3418704bbbc2051c1042 (diff)  |                           |
Bugfix stream-mask override, support old CUDA, and start Hopper support
Use a different callback to intercept the TMD/QMD later in the
launch pipeline.
Major improvements:
- Fix bug with next mask not overriding stream mask on CUDA 11.0+
- Add CUDA 6.5-10.2 support for next- and global-granularity
partitioning masks on x86_64 and aarch64 Jetson
- Remove libdl dependency
- Partially support TMD/QMD Version 4 (Hopper)
Minor improvements:
- Check for sufficient CUDA version before attempting to apply a
  next-granularity partitioning mask
- Only check for sufficient CUDA version on the first call to
`libsmctrl_set_next_mask()` or `libsmctrl_set_global_mask()`,
rather than checking every time (lowers overheads)
- Check that TMD version is sufficient before modifying it
- Improve documentation
Issues:
- Partitioning mask bits have a different meaning in TMD/QMD
Version 4 and require floorsweeping and remapping information to
properly construct. This information will be forthcoming in
future releases of libsmctrl and nvdebug.
-rw-r--r-- | Makefile    |   2 |
-rw-r--r-- | README.md   |   6 |
-rw-r--r-- | libsmctrl.c | 247 |
-rw-r--r-- | libsmctrl.h |   6 |
4 files changed, 92 insertions, 169 deletions
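For orientation, here is a hedged usage sketch of the three mask granularities this commit touches; mask precedence is next > stream > global, per the header below. The kernel and stream names are illustrative, and the set-bit-disables-a-TPC convention is taken from the project README:

```c
#include <cuda_runtime.h>
#include "libsmctrl.h"

__global__ void work(void) {}  // Hypothetical kernel; any launch would do

int main(void) {
	// Global default: set bits disable TPCs, so ~0xFFull leaves only
	// TPCs 0-7 enabled for every kernel unless overridden below.
	libsmctrl_set_global_mask(~0xFFull);

	cudaStream_t stream;
	cudaStreamCreate(&stream);
	// Per-stream override: launches on `stream` may only use TPCs 8-15
	libsmctrl_set_stream_mask(stream, ~0xFF00ull);

	// Per-launch override---the case this commit fixes on CUDA 11.0+:
	// the next launch from this thread runs on TPC 0 only, overriding
	// both the stream and global masks above.
	libsmctrl_set_next_mask(~0x1ull);
	work<<<1, 32, 0, stream>>>();

	cudaDeviceSynchronize();
	return 0;
}
```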
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ CXX = g++
 NVCC ?= nvcc
 # -fPIC is needed in all cases, as we may be linked into another shared library
 CFLAGS = -fPIC
-LDFLAGS = -lcuda -I/usr/local/cuda/include -ldl
+LDFLAGS = -lcuda -I/usr/local/cuda/include
 
 .PHONY: clean tests
 
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -93,18 +93,16 @@ make tests
 #### Known Working
 
 - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
-- CUDA 8.0 through 12.6
+- CUDA 6.5 through 12.6
 - `x86_64` and Jetson `aarch64` platforms
 
 #### Known Issues
 
-- `next_mask` will not override `stream_mask` on CUDA 11.0+
-  - _As of Feb 2024, a fix for this is coming soon..._
 - `global_mask` and `next_mask` cannot disable TPCs with IDs above 128
   - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada
-- Untested on H100 (compute capability 9.0)
 - Untested on non-Jetson `aarch64` platforms
 - Untested on CUDA 11.8, 12.0, and 12.1 on Jetson `aarch64`
+- Mask bit indexes do not directly correlate to software-visible TPC/SM IDs in V4 TMD/QMDs (Hopper+; compute capability 9.0). The mask bit indexes instead appear to correspond to on-chip units, including disabled ones; i.e., the set of pre-SM-ID-remapping and pre-floorsweeping TPCs
 
 ## Important Limitations
 
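To make the 128-TPC limitation above concrete, here is a small hedged helper for building disable masks; `partition_mask` is a name invented for this sketch, and the set-bit-disables convention follows the README:

```c
#include <stdint.h>

// Build a disable mask leaving only TPCs [first, first + count) enabled.
// A uint64_t mask can only address TPCs 0-63; the _ext (uint128_t) stream
// variant reaches TPC 127, and nothing currently reaches TPC 128+.
static uint64_t partition_mask(unsigned first, unsigned count) {
	uint64_t enabled = (count >= 64) ? ~0ull : ((1ull << count) - 1);
	enabled <<= first;
	return ~enabled;  // Set bits disable; all TPCs outside the range
}

// Example: libsmctrl_set_global_mask(partition_mask(0, 8));
```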
diff --git a/libsmctrl.c b/libsmctrl.c
index 1018e44..24a3177 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -5,31 +5,34 @@
  *
  * This file implements partitioning via three different mechanisms:
  * - Modifying the QMD/TMD immediately prior to upload
- * - Changing a field in CUDA's global struct that CUDA applies to the QMD/TMD
  * - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD
  * This table shows the mechanism used with each CUDA version:
  * +-----------+---------------+---------------+--------------+
  * | Version   | Global Mask   | Stream Mask   | Next Mask    |
  * +-----------+---------------+---------------+--------------+
- * | 11.0-12.6 | TMD/QMD Hook  | stream struct | TMD/QMD Hook |
- * | 10.2      | global struct | stream struct | N/A          |
- * | 8.0-10.1  | N/A           | stream struct | N/A          |
+ * | 8.0-12.6  | TMD/QMD Hook  | stream struct | TMD/QMD Hook |
+ * | 6.5-7.5   | TMD/QMD Hook  | N/A           | TMD/QMD Hook |
  * +-----------+---------------+---------------+--------------+
  * "N/A" indicates that a mask type is unsupported on that CUDA version.
  * Please contact the authors if support is needed for a particular feature on
  * an older CUDA version. Support for those is unimplemented, not impossible.
+ *
+ * An old implementation of this file effected the global mask on CUDA 10.2 by
+ * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
+ * That implementation was extraordinarily complicated, and was replaced in
+ * 2024 with a more-backward-compatible way of hooking the TMD/QMD.
+ * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
  */
 #include <cuda.h>
 
 #include <errno.h>
 #include <error.h>
 #include <fcntl.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <unistd.h>
 
-#include <dlfcn.h>
-
 #include "libsmctrl.h"
 
 // In functions that do not return an error code, we favor terminating with an
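A quick, hedged way to check which row of the table above applies at runtime; cuDriverGetVersion() encodes the version as 1000*major + 10*minor:

```c
#include <cuda.h>
#include <stdio.h>

int main(void) {
	int ver = 0;
	cuDriverGetVersion(&ver);  // Callable even before cuInit()
	// e.g. 12060 -> "12.6"; the table's rows span 6050 (6.5) and up
	printf("Driver supports up to CUDA %d.%d\n", ver / 1000, (ver % 1000) / 10);
	return 0;
}
```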
@@ -37,148 +40,66 @@
 #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
                                              __VA_ARGS__)
 
-// Layout of mask control fields to match CUDA's static global struct
-struct global_sm_control {
-	uint32_t enabled;
-	uint64_t mask;
-} __attribute__((packed));
-
-/*** CUDA Globals Manipulation. CUDA 10.2 only ***/
-
-// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson)
-static struct global_sm_control* g_sm_control = NULL;
-
-/* Find the location of CUDA's `globals` struct and the SM mask control fields
- * No symbols are exported from within `globals`, so this has to do a very
- * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
- * Don't call this before the CUDA library has been initialized.
- * (Note that this appears to work, even if built on CUDA > 10.2.)
- */
-static void setup_g_sm_control_10() {
-	if (g_sm_control)
-		return;
-	// The location of the static global struct containing the global SM
-	// mask field will vary depending on where the loader locates the CUDA
-	// library. In order to reliably modify this struct, we must defeat
-	// that relocation by deriving its location relative to a known
-	// reference point.
-	//
-	// == Choosing a Reference Point:
-	// The cudbg* symbols appear to be relocated to a constant offset from
-	// the globals structure, and so we use the address of the symbol
-	// `cudbgReportDriverApiErrorFlags` as our reference point. (This ends
-	// up being the closest to an intermediate table we use as part of our
-	// lookup---process discussed below.)
-	//
-	// Unfortunately, the symbol we reference is errantly omitted from the
-	// libcuda.so stub used by nvcc starting around CUDA 11.8, so we have to
-	// use dlsym to avoid build-time issues.
-	void* hndl = dlopen(NULL, RTLD_LAZY);
-	uint32_t* sym = dlsym(hndl, "cudbgReportDriverApiErrorFlags");
-
-	// == Deriving Location:
-	// The number of CUDA devices available is co-located in the same CUDA
-	// globals structure that we aim to modify the SM mask field in. The
-	// value in that field can be assigned to a user-controlled pointer via
-	// the cuDeviceGetCount() CUDA Driver Library function. To determine
-	// the location of the structure, we pass a bad address to the function
-	// and disassemble the code adjacent to where it segfaults. On the
-	// Jetson Xavier with CUDA 10.2, the assembly is as follows:
-	// (reg x19 contains cuDeviceGetCount()'s user-provided pointer)
-	// ...
-	// 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero
-	// 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl
-	// 0x0000007fb71454bc: ldr x1, [x1,#3672]    // Get addr of globals
-	// 0x0000007fb71454c0: ldr w1, [x1,#904]     // Get count from globals
-	// 0x0000007fb71454c4: str w1, [x19]         // Store count at user addr
-	// ...
-	// In this assembly, we can identify that CUDA uses an internal lookup
-	// table to identify the location of the globals structure (pointer
-	// 459 in the table; offset 3672). After obtaining this pointer, it
-	// advances to offset 904 in the global structure, dereferences the
-	// value stored there, and then attempts to store it at the user-
-	// -provided address (register x19). This final line will trigger a
-	// segfault if a non-zero bad address is passed to cuDeviceGetCount().
-	//
-	// On x86_64:
-	// (reg %rbx contains cuDeviceGetCount()'s user-provided pointer)
-	// ...
-	// 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero
-	// 0x00007ffff6cac022: je 0x7ffff6cac038 // ''
-	// 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer
-	// 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference
-	// 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr
-	// ...
-	// Note that this does not use an intermediate lookup table.
-	//
-	// [Aside: cudbgReportDriverApiErrorFlags is currently the closest
-	// symbol to **the lookup table**. cudbgDebuggerInitialized is closer
-	// to the globals struct itself (+7424 == SM mask control), but we
-	// prefer the table lookup approach for now, as that's what
-	// cuDeviceGetCount() does.]
-
-#if __aarch64__
-	// In my test binary, the lookup table is at address 0x7fb7ea6000, and
-	// this is 1029868 bytes before the address for
-	// cudbgReportDriverApiErrorFlags. Use this information to derive the
-	// location of the lookup in our binary (defeat relocation).
-	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
-	// Address of `globals` is at offset 3672 (entry 459?) in the table
-	uintptr_t globals_addr = *(tbl_base + 459);
-	// SM mask control is at offset 4888 in the `globals` struct
-	// [Device count at offset 904 (0x388)]
-	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
-#endif // __aarch64__
-#if __x86_64__
-	// In my test binary, globals is at 0x7ffff7cb0548, which is 1103576
-	// bytes before the address for cudbgReportDriverApiErrorFlags
-	// (0x7ffff7dbdc20). Use this offset to defeat relocation.
-	uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576);
-	// SM mask control is at offset 4728 in the `globals` struct
-	// [Device count at offset 776 (0x308)]
-	g_sm_control = (struct global_sm_control*)(globals_addr + 4728);
-#endif // __x86_64__
-	// SM mask should be empty by default
-	if (g_sm_control->enabled || g_sm_control->mask)
-		fprintf(stderr, "Warning: Found non-empty SM disable mask "
-		        "during setup! libsmctrl_set_global_mask() is "
-		        "unlikely to work on this platform!\n");
-}
-
-/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
+/*** QMD/TMD-based SM Mask Control via Debug Callback. ***/
 
-// Tested working on CUDA x86_64 11.0-12.2.
-// Tested not working on aarch64 or x86_64 10.2
+// Tested working on x86_64 CUDA 6.5, 9.1, and various 10+ versions
+// (No testing attempted on pre-CUDA-6.5 versions)
+// Values for the following three lines can be extracted by tracing CUPTI as
+// it interacts with libcuda.so to set callbacks.
 static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
-#define LAUNCH_DOMAIN 0x3
-#define LAUNCH_PRE_UPLOAD 0x3
+// These callback descriptors appear to intercept the TMD/QMD late enough that
+// CUDA has already applied the per-stream mask from its internal data
+// structures, allowing us to override it with the next mask.
+#define QMD_DOMAIN 0xb
+#define QMD_PRE_UPLOAD 0x1
+// Global mask (applies across all threads)
 static uint64_t g_sm_mask = 0;
+// Next mask (applies per-thread)
 static __thread uint64_t g_next_sm_mask = 0;
-static char sm_control_setup_called = 0;
-static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
-	// The third 8-byte element in `in_params` is a pointer to the stream struct.
-	// This exists even when in_params < 0x50. This could be used to implement
-	// stream masking without the manual offsets specified elsewhere (store a
-	// table of stream pointers to masks and do a lookup here).
-	// It could also be used (although not as easily) to support global and next
-	// masking on old CUDA versions, but that would require hooking earlier in the
-	// launch process (before the stream mask is applied).
-	if (*(uint32_t*)in_params < 0x50)
+// Flag value to indicate if setup has been completed
+static bool sm_control_setup_called = false;
+
+// v1 has been removed---it intercepted the TMD/QMD too early, making it
+// impossible to override the CUDA-injected stream mask with the next mask.
+static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in_params) {
+	// ***Only tested on platforms with 64-bit pointers.***
+	// The first 8-byte element in `in_params` appears to be its size. `in_params`
+	// must have at least five 8-byte elements for index four to be valid.
+	if (*(uint32_t*)in_params < 5 * sizeof(void*))
 		abort(1, 0, "Unsupported CUDA version for callback-based SM masking. Aborting...");
-	// The eighth 8-byte element in `in_params` is a pointer to a struct which
-	// contains a pointer to the TMD as its first element. Note that this eighth
-	// pointer must exist---it only exists when the first 8-byte element of
-	// `in_params` is at least 0x50 (checked above).
-	void* tmd = **((uintptr_t***)in_params + 8);
+	// The fourth 8-byte element in `in_params` is a pointer to the TMD. Note
+	// that this fourth pointer must exist---it only exists when the first
+	// 8-byte element of `in_params` is at least 0x28 (checked above).
+	void* tmd = *((void**)in_params + 4);
 	if (!tmd)
 		abort(1, 0, "TMD allocation appears NULL; likely forward-compatibility issue.\n");
 
-	//fprintf(stderr, "cta: %lx\n", *(uint64_t*)(tmd + 74));
-	// TODO: Check for supported QMD version (>XXX, <4.00)
-	// TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 16 bytes (rather than 8 bytes) wide. It also requires an enable bit at +31bits.
-	uint32_t *lower_ptr = tmd + 84;
-	uint32_t *upper_ptr = tmd + 88;
+	uint32_t *lower_ptr, *upper_ptr;
+
+	// The location of the TMD version field seems consistent across versions
+	uint8_t tmd_ver = *(uint8_t*)(tmd + 72);
+
+	if (tmd_ver >= 0x40) {
+		// TMD V04_00 is used starting with Hopper to support masking >64 TPCs
+		lower_ptr = tmd + 304;
+		upper_ptr = tmd + 308;
+		// XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
+		// ...global_mask_ext
+		*(uint32_t*)(tmd + 312) = -1;
+		*(uint32_t*)(tmd + 316) = -1;
+		// An enable bit is also required
+		*(uint32_t*)tmd |= 0x80000000;
+	} else if (tmd_ver >= 0x16) {
+		// TMD V01_06 is used starting with Kepler V2, and is the first to
+		// support TPC masking
+		lower_ptr = tmd + 84;
+		upper_ptr = tmd + 88;
+	} else {
+		// TMD V00_06 is documented to not support SM masking
+		abort(1, 0, "TMD version %04o is too old! This GPU does not support SM masking.\n", tmd_ver);
+	}
 
+	// Setting the next mask overrides both per-stream and global masks
 	if (g_next_sm_mask) {
 		*lower_ptr = (uint32_t)g_next_sm_mask;
 		*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
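Isolated from the callback plumbing, the version dispatch above amounts to the following hedged sketch; the offsets are copied from the diff, while `apply_mask` and its error handling are illustrative, not the library's API:

```c
#include <stdint.h>

// Apply a 64-bit TPC disable mask to a TMD/QMD, per the offsets in the
// diff: version byte at +72, V1 mask words at +84/+88, V4 mask words at
// +304..+316 plus a mask-enable bit at bit 31 of the first word.
static int apply_mask(char *tmd, uint64_t mask) {
	uint8_t tmd_ver = *(uint8_t *)(tmd + 72);
	uint32_t *lower, *upper;
	if (tmd_ver >= 0x40) {                  // V04_00: Hopper and newer
		lower = (uint32_t *)(tmd + 304);
		upper = (uint32_t *)(tmd + 308);
		*(uint32_t *)(tmd + 312) = -1;  // Disable TPCs 64-95 for now
		*(uint32_t *)(tmd + 316) = -1;  // Disable TPCs 96-127 for now
		*(uint32_t *)tmd |= 0x80000000; // Mask-enable bit
	} else if (tmd_ver >= 0x16) {           // V01_06: Kepler V2 and newer
		lower = (uint32_t *)(tmd + 84);
		upper = (uint32_t *)(tmd + 88);
	} else {
		return -1;                      // V00_06 cannot mask SMs
	}
	*lower = (uint32_t)mask;
	*upper = (uint32_t)(mask >> 32);
	return 0;
}
```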
@@ -188,11 +109,12 @@ static void launchCallback(void *ukwn, int domain, int cbid, const void *in_para
 		*lower_ptr = (uint32_t)g_sm_mask;
 		*upper_ptr = (uint32_t)(g_sm_mask >> 32);
 	}
-	//fprintf(stderr, "lower mask: %x\n", *lower_ptr);
-	//fprintf(stderr, "upper mask: %x\n", *upper_ptr);
+
+	//fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
+	//fprintf(stderr, "Final SM Mask (upper): %x\n", *upper_ptr);
 }
 
-static void setup_sm_control_11() {
+static void setup_sm_control_callback() {
 	int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
 	int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
 	uintptr_t* tbl_base;
@@ -207,38 +129,41 @@ static void setup_sm_control_11() {
 	subscribe = (typeof(subscribe))subscribe_func_addr;
 	enable = (typeof(enable))enable_func_addr;
 	int res = 0;
-	res = subscribe(&my_hndl, launchCallback, NULL);
+	res = subscribe(&my_hndl, control_callback_v2, NULL);
 	if (res)
 		abort(1, 0, "Error subscribing to launch callback. CUDA returned error code %d.", res);
-	res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD);
+	res = enable(1, my_hndl, QMD_DOMAIN, QMD_PRE_UPLOAD);
 	if (res)
 		abort(1, 0, "Error enabling launch callback. CUDA returned error code %d.", res);
 }
 
 // Set default mask for all launches
 void libsmctrl_set_global_mask(uint64_t mask) {
-	int ver;
-	cuDriverGetVersion(&ver);
-	if (ver == 10020) {
-		if (!g_sm_control)
-			setup_g_sm_control_10();
-		g_sm_control->mask = mask;
-		g_sm_control->enabled = 1;
-	} else if (ver > 10020) {
-		if (!sm_control_setup_called)
-			setup_sm_control_11();
-		g_sm_mask = mask;
-	} else { // < CUDA 10.2
-		abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; "
-		      "this application is using CUDA %d.%d",
-		      ver / 1000, (ver % 100));
+	if (!sm_control_setup_called) {
+		// The version will not change while running, so only check once
+		int ver = 0;
+		cuDriverGetVersion(&ver);
+		if (ver < 6050)
+			abort(1, ENOSYS, "Global masking requires at least CUDA 6.5; "
+			      "this application is using CUDA %d.%d",
+			      ver / 1000, (ver % 100));
+		setup_sm_control_callback();
 	}
+	g_sm_mask = mask;
 }
 
 // Set mask for next launch from this thread
 void libsmctrl_set_next_mask(uint64_t mask) {
-	if (!sm_control_setup_called)
-		setup_sm_control_11();
+	if (!sm_control_setup_called) {
+		// The version will not change while running, so only check once
+		int ver = 0;
+		cuDriverGetVersion(&ver);
+		if (ver < 6050)
+			abort(1, ENOSYS, "Next masking requires at least CUDA 6.5; "
+			      "this application is using CUDA %d.%d",
+			      ver / 1000, (ver % 100));
+		setup_sm_control_callback();
+	}
 	g_next_sm_mask = mask;
 }
 
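For completeness, the part of setup_sm_control_callback() elided between the hunks above resolves the subscribe/enable pointers out of an undocumented driver export table. A hedged sketch of that pattern follows; cuGetExportTable() is a real (if unadvertised) driver entry point, but the table indexes (3 and 6) are assumptions based on the shipped implementation and may differ:

```c
#include <cuda.h>
#include <stdint.h>

// UUID identifying the debug-callback export table (copied from the diff)
static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8,
	0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71,
	(char)0x9f, (char)0xe5, (char)0xf7, 0x4b};

static void subscribe_sketch(void (*cb)(void*, int, int, const void*)) {
	uintptr_t *tbl_base = NULL;
	uint32_t my_hndl = 0;
	cuGetExportTable((const void **)&tbl_base, &callback_funcs_id);
	// Assumed table slots for the subscribe/enable entry points
	int (*subscribe)(uint32_t*, void(*)(void*, int, int, const void*), void*);
	int (*enable)(uint32_t, uint32_t, int, int);
	subscribe = (typeof(subscribe))*(tbl_base + 3);
	enable = (typeof(enable))*(tbl_base + 6);
	subscribe(&my_hndl, cb, NULL);
	enable(1, my_hndl, 0xb /* QMD_DOMAIN */, 0x1 /* QMD_PRE_UPLOAD */);
}
```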
diff --git a/libsmctrl.h b/libsmctrl.h
index 6285de6..b85c0c7 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2024 Joshua Bakita
+ * Copyright 2022-2024 Joshua Bakita
  * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
  * logic in the CUDA driver library, and thus requires a build with -lcuda.
  */
@@ -15,7 +15,7 @@ typedef unsigned __int128 uint128_t;
 
 // Set global default TPC mask for all kernels, incl. CUDA-internal ones
 // @param mask  A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
-// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1
+// Supported: CUDA 6.5 - CUDA 12.6
 extern void libsmctrl_set_global_mask(uint64_t mask);
 // Set default TPC mask for all kernels launched via `stream`
 // (overrides global mask)
@@ -27,7 +27,7 @@ extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
 // Set TPC mask for the next kernel launch from the caller's CPU thread
 // (overrides global and per-stream masks, applies only to next launch).
 // @param mask  A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
-// Supported: CUDA 11.0 - CUDA 12.1
+// Supported: CUDA 6.5 - CUDA 12.6
 extern void libsmctrl_set_next_mask(uint64_t mask);
 
 /**
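Finally, since the header exposes a uint128_t stream variant for GPUs with more than 64 TPCs, a short hedged example of using it (TPC numbers illustrative; the same set-bit-disables convention is assumed):

```c
#include "libsmctrl.h"

// Allow a stream to use only TPCs 64-71 on a >64-TPC GPU such as the
// RTX 6000 Ada; this requires the 128-bit variant, since a uint64_t
// mask cannot address TPCs above 63.
static void restrict_stream(void *stream) {
	uint128_t enabled = (uint128_t)0xFF << 64;
	libsmctrl_set_stream_mask_ext(stream, ~enabled);
}
```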