author    Joshua Bakita <jbakita@cs.unc.edu>  2024-12-19 14:20:38 -0500
committer Joshua Bakita <jbakita@cs.unc.edu>  2024-12-19 14:48:21 -0500
commit    d052c2df34ab41ba285f70965663e5a0832f6ac9 (patch)
tree      0a761be3f62910275da8a2cad546a8902073b1e9
parent    aa63a02efa5fc8701f0c3418704bbbc2051c1042 (diff)

Bugfix stream-mask override, support old CUDA, and start Hopper support

Use a different callback to intercept the TMD/QMD later in the launch
pipeline.

Major improvements:
- Fix bug with the next mask not overriding the stream mask on CUDA 11.0+
- Add CUDA 6.5-10.2 support for next- and global-granularity partitioning
  masks on x86_64 and aarch64 Jetson
- Remove libdl dependency
- Partially support TMD/QMD Version 4 (Hopper)

Minor improvements:
- Check for sufficient CUDA version before attempting to apply a
  next-granularity partitioning mask
- Only check for sufficient CUDA version on the first call to
  `libsmctrl_set_next_mask()` or `libsmctrl_set_global_mask()`, rather than
  checking every time (lowers overheads)
- Check that the TMD version is sufficient before modifying it
- Improve documentation

Issues:
- Partitioning-mask bits have a different meaning in TMD/QMD Version 4 and
  require floorsweeping and remapping information to properly construct.
  This information will be forthcoming in future releases of libsmctrl and
  nvdebug.
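For context, a minimal usage sketch (not part of this commit) of the behavior
the headline bugfix restores: a next mask taking precedence over a stream mask
on CUDA 11.0+. The kernel and mask values are illustrative; per the README, a
set mask bit disables the corresponding TPC.

    #include <cuda_runtime.h>
    #include "libsmctrl.h"

    __global__ void work(void) { }  // placeholder kernel

    int main(void) {
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        // Restrict this stream to TPCs 0-3 (set bits disable TPCs)
        libsmctrl_set_stream_mask(stream, ~0xfull);
        // Request TPCs 4-7 for only the next launch from this thread
        libsmctrl_set_next_mask(~0xf0ull);
        // Before this commit, CUDA 11.0+ reapplied the stream mask over the
        // next mask; with this commit, this launch runs on TPCs 4-7.
        work<<<1, 32, 0, stream>>>();
        cudaDeviceSynchronize();
        return 0;
    }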
-rw-r--r--  Makefile      2
-rw-r--r--  README.md     6
-rw-r--r--  libsmctrl.c   247
-rw-r--r--  libsmctrl.h   6
4 files changed, 92 insertions, 169 deletions
diff --git a/Makefile b/Makefile
index 0e9ee3a..0d9b9f6 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ CXX = g++
 NVCC ?= nvcc
 # -fPIC is needed in all cases, as we may be linked into another shared library
 CFLAGS = -fPIC
-LDFLAGS = -lcuda -I/usr/local/cuda/include -ldl
+LDFLAGS = -lcuda -I/usr/local/cuda/include
 
 .PHONY: clean tests
 
diff --git a/README.md b/README.md
index ce32b19..c3f87c4 100644
--- a/README.md
+++ b/README.md
@@ -93,18 +93,16 @@ make tests
 #### Known Working
 
 - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
-- CUDA 8.0 through 12.6
+- CUDA 6.5 through 12.6
 - `x86_64` and Jetson `aarch64` platforms
 
 #### Known Issues
 
-- `next_mask` will not override `stream_mask` on CUDA 11.0+
-  - _As of Feb 2024, a fix for this is coming soon..._
 - `global_mask` and `next_mask` cannot disable TPCs with IDs above 128
   - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada
-- Untested on H100 (compute capability 9.0)
 - Untested on non-Jetson `aarch64` platforms
 - Untested on CUDA 11.8, 12.0, and 12.1 on Jetson `aarch64`
+- Mask bit indexes do not directly correlate to software-visible TPC/SM IDs in V4 TMD/QMDs (Hopper+; compute capability 9.0). The mask bit indexes instead appear to correspond to on-chip units, including disabled ones; i.e., the set of pre-SM-ID-remapping and pre-floorsweeping TPCs
 
 ## Important Limitations
 
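To make the README's bitmask semantics concrete: bit i set to 1 disables TPC i,
so a 64-bit mask can only control TPCs 0-63 (hence the >128-TPC issue above
even for the 128-bit `_ext` variants). An illustrative helper, not code from
the repository:

    #include <stdint.h>

    // Build a uint64_t disable mask leaving only TPCs [first, last] enabled.
    // TPC IDs of 64 and up cannot be expressed in this 64-bit API.
    static uint64_t tpc_range_mask(int first, int last) {
        uint64_t enabled = 0;
        for (int i = first; i <= last && i < 64; i++)
            enabled |= 1ull << i;
        return ~enabled;  // invert: set bits disable TPCs
    }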
diff --git a/libsmctrl.c b/libsmctrl.c
index 1018e44..24a3177 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -5,31 +5,34 @@
  *
  * This file implements partitioning via three different mechanisms:
  * - Modifying the QMD/TMD immediately prior to upload
- * - Changing a field in CUDA's global struct that CUDA applies to the QMD/TMD
  * - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD
  * This table shows the mechanism used with each CUDA version:
  * +-----------+---------------+---------------+--------------+
  * | Version   | Global Mask   | Stream Mask   | Next Mask    |
  * +-----------+---------------+---------------+--------------+
- * | 11.0-12.6 | TMD/QMD Hook  | stream struct | TMD/QMD Hook |
- * | 10.2      | global struct | stream struct | N/A          |
- * | 8.0-10.1  | N/A           | stream struct | N/A          |
+ * | 8.0-12.6  | TMD/QMD Hook  | stream struct | TMD/QMD Hook |
+ * | 6.5-7.5   | TMD/QMD Hook  | N/A           | TMD/QMD Hook |
  * +-----------+---------------+---------------+--------------+
  * "N/A" indicates that a mask type is unsupported on that CUDA version.
  * Please contact the authors if support is needed for a particular feature on
  * an older CUDA version. Support for those is unimplemented, not impossible.
+ *
+ * An old implementation of this file effected the global mask on CUDA 10.2 by
+ * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
+ * That implementation was extraordinarily complicated, and was replaced in
+ * 2024 with a more-backward-compatible way of hooking the TMD/QMD.
+ * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
  */
 #include <cuda.h>
 
 #include <errno.h>
 #include <error.h>
 #include <fcntl.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <unistd.h>
 
-#include <dlfcn.h>
-
 #include "libsmctrl.h"
 
 // In functions that do not return an error code, we favor terminating with an
@@ -37,148 +40,66 @@
 #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
 	__VA_ARGS__)
 
-// Layout of mask control fields to match CUDA's static global struct
-struct global_sm_control {
-	uint32_t enabled;
-	uint64_t mask;
-} __attribute__((packed));
-
-/*** CUDA Globals Manipulation. CUDA 10.2 only ***/
-
-// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson)
-static struct global_sm_control* g_sm_control = NULL;
-
-/* Find the location of CUDA's `globals` struct and the SM mask control fields
- * No symbols are exported from within `globals`, so this has to do a very
- * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
- * Don't call this before the CUDA library has been initialized.
- * (Note that this appears to work, even if built on CUDA > 10.2.)
- */
-static void setup_g_sm_control_10() {
-	if (g_sm_control)
-		return;
-	// The location of the static global struct containing the global SM
-	// mask field will vary depending on where the loader locates the CUDA
-	// library. In order to reliably modify this struct, we must defeat
-	// that relocation by deriving its location relative to a known
-	// reference point.
-	//
-	// == Choosing a Reference Point:
-	// The cudbg* symbols appear to be relocated to a constant offset from
-	// the globals structure, and so we use the address of the symbol
-	// `cudbgReportDriverApiErrorFlags` as our reference point. (This ends
-	// up being the closest to an intermediate table we use as part of our
-	// lookup---process discussed below.)
-	//
-	// Unfortunately, the symbol we reference is errantly omitted from the
-	// libcuda.so stub used by nvcc starting around CUDA 11.8, so we have to
-	// use dlsym to avoid build-time issues.
-	void* hndl = dlopen(NULL, RTLD_LAZY);
-	uint32_t* sym = dlsym(hndl, "cudbgReportDriverApiErrorFlags");
-
-	// == Deriving Location:
-	// The number of CUDA devices available is co-located in the same CUDA
-	// globals structure that we aim to modify the SM mask field in. The
-	// value in that field can be assigned to a user-controlled pointer via
-	// the cuDeviceGetCount() CUDA Driver Library function. To determine
-	// the location of thu structure, we pass a bad address to the function
-	// and dissasemble the code adjacent to where it segfaults. On the
-	// Jetson Xavier with CUDA 10.2, the assembly is as follows:
-	// (reg x19 contains cuDeviceGetCount()'s user-provided pointer)
-	// ...
-	// 0x0000007fb71454b4: cbz x19, 0x7fb71454d0   // Check ptr non-zero
-	// 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000   // Addr of lookup tbl
-	// 0x0000007fb71454bc: ldr x1, [x1,#3672]      // Get addr of globals
-	// 0x0000007fb71454c0: ldr w1, [x1,#904]       // Get count from globals
-	// 0x0000007fb71454c4: str w1, [x19]           // Store count at user addr
-	// ...
-	// In this assembly, we can identify that CUDA uses an internal lookup
-	// table to identify the location of the globals structure (pointer
-	// 459 in the table; offset 3672). After obtaining this pointer, it
-	// advances to offset 904 in the global structure, dereferences the
-	// value stored there, and then attempts to store it at the user-
-	// -provided address (register x19). This final line will trigger a
-	// segfault if a non-zero bad address is passed to cuDeviceGetCount().
-	//
-	// On x86_64:
-	// (reg %rbx contains cuDeviceGetCount()'s user-provided pointer)
-	// ...
-	// 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero
-	// 0x00007ffff6cac022: je 0x7ffff6cac038 // ''
-	// 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer
-	// 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference
-	// 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr
-	// ...
-	// Note that this does not use an intermediate lookup table.
-	//
-	// [Aside: cudbgReportDriverApiErrorFlags is currently the closest
-	// symbol to **the lookup table**. cudbgDebuggerInitialized is closer
-	// to the globals struct itself (+7424 == SM mask control), but we
-	// perfer the table lookup approach for now, as that's what
-	// cuDeviceGetCount() does.]
-
-#if __aarch64__
-	// In my test binary, the lookup table is at address 0x7fb7ea6000, and
-	// this is 1029868 bytes before the address for
-	// cudbgReportDriverApiErrorFlags. Use this information to derive the
-	// location of the lookup in our binary (defeat relocation).
-	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
-	// Address of `globals` is at offset 3672 (entry 459?) in the table
-	uintptr_t globals_addr = *(tbl_base + 459);
-	// SM mask control is at offset 4888 in the `globals` struct
-	// [Device count at offset 904 (0x388)]
-	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
-#endif // __aarch64__
-#if __x86_64__
-	// In my test binary, globals is at 0x7ffff7cb0548, which is 1103576
-	// bytes before the address for cudbgReportDriverApiErrorFlags
-	// (0x7ffff7dbdc20). Use this offset to defeat relocation.
-	uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576);
-	// SM mask control is at offset 4728 in the `globals` struct
-	// [Device count at offset 776 (0x308)]
-	g_sm_control = (struct global_sm_control*)(globals_addr + 4728);
-#endif // __x86_64__
-	// SM mask should be empty by default
-	if (g_sm_control->enabled || g_sm_control->mask)
-		fprintf(stderr, "Warning: Found non-empty SM disable mask "
-			"during setup! libsmctrl_set_global_mask() is "
-			"unlikely to work on this platform!\n");
-}
-
-/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
+/*** QMD/TMD-based SM Mask Control via Debug Callback. ***/
 
-// Tested working on CUDA x86_64 11.0-12.2.
-// Tested not working on aarch64 or x86_64 10.2
+// Tested working on x86_64 CUDA 6.5, 9.1, and various 10+ versions
+// (No testing attempted on pre-CUDA-6.5 versions)
+// Values for the following three lines can be extracted by tracing CUPTI as
+// it interacts with libcuda.so to set callbacks.
 static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
-#define LAUNCH_DOMAIN 0x3
-#define LAUNCH_PRE_UPLOAD 0x3
+// These callback descriptors appear to intercept the TMD/QMD late enough that
+// CUDA has already applied the per-stream mask from its internal data
+// structures, allowing us to override it with the next mask.
+#define QMD_DOMAIN 0xb
+#define QMD_PRE_UPLOAD 0x1
+// Global mask (applies across all threads)
 static uint64_t g_sm_mask = 0;
+// Next mask (applies per-thread)
 static __thread uint64_t g_next_sm_mask = 0;
-static char sm_control_setup_called = 0;
-static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
-	// The third 8-byte element in `in_parms` is a pointer to the stream struct.
-	// This exists even when in_params < 0x50. This could be used to implement
-	// stream masking without the manual offsets specified elsewhere (store a
-	// table of stream pointers to masks and do a lookup here).
-	// It could also be used (although not as easily) to support global and next
-	// masking on old CUDA versions, but that would require hooking earlier in the
-	// launch process (before the stream mask is applied).
-	if (*(uint32_t*)in_params < 0x50)
+// Flag value to indicate if setup has been completed
+static bool sm_control_setup_called = false;
+
+// v1 has been removed---it intercepted the TMD/QMD too early, making it
+// impossible to override the CUDA-injected stream mask with the next mask.
+static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in_params) {
+	// ***Only tested on platforms with 64-bit pointers.***
+	// The first 8-byte element in `in_params` appears to be its size. `in_params`
+	// must have at least five 8-byte elements for index four to be valid.
+	if (*(uint32_t*)in_params < 5 * sizeof(void*))
 		abort(1, 0, "Unsupported CUDA version for callback-based SM masking. Aborting...");
-	// The eighth 8-byte element in `in_params` is a pointer to a struct which
-	// contains a pointer to the TMD as its first element. Note that this eighth
-	// pointer must exist---it only exists when the first 8-byte element of
-	// `in_params` is at least 0x50 (checked above).
-	void* tmd = **((uintptr_t***)in_params + 8);
+	// The fourth 8-byte element in `in_params` is a pointer to the TMD. Note
+	// that this fourth pointer must exist---it only exists when the first
+	// 8-byte element of `in_params` is at least 0x28 (checked above).
+	void* tmd = *((void**)in_params + 4);
 	if (!tmd)
 		abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n");
 
-	//fprintf(stderr, "cta: %lx\n", *(uint64_t*)(tmd + 74));
-	// TODO: Check for supported QMD version (>XXX, <4.00)
-	// TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 16 bytes (rather than 8 bytes) wide. It also requires an enable bit at +31bits.
-	uint32_t *lower_ptr = tmd + 84;
-	uint32_t *upper_ptr = tmd + 88;
+	uint32_t *lower_ptr, *upper_ptr;
+
+	// The location of the TMD version field seems consistent across versions
+	uint8_t tmd_ver = *(uint8_t*)(tmd + 72);
+
+	if (tmd_ver >= 0x40) {
+		// TMD V04_00 is used starting with Hopper to support masking >64 TPCs
+		lower_ptr = tmd + 304;
+		upper_ptr = tmd + 308;
+		// XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
+		// ...global_mask_ext
+		*(uint32_t*)(tmd + 312) = -1;
+		*(uint32_t*)(tmd + 316) = -1;
+		// An enable bit is also required
+		*(uint32_t*)tmd |= 0x80000000;
+	} else if (tmd_ver >= 0x16) {
+		// TMD V01_06 is used starting with Kepler V2, and is the first to
+		// support TPC masking
+		lower_ptr = tmd + 84;
+		upper_ptr = tmd + 88;
+	} else {
+		// TMD V00_06 is documented to not support SM masking
+		abort(1, 0, "TMD version %04o is too old! This GPU does not support SM masking.\n", tmd_ver);
+	}
 
+	// Setting the next mask overrides both per-stream and global masks
 	if (g_next_sm_mask) {
 		*lower_ptr = (uint32_t)g_next_sm_mask;
 		*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
@@ -188,11 +109,12 @@ static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params)
 		*lower_ptr = (uint32_t)g_sm_mask;
 		*upper_ptr = (uint32_t)(g_sm_mask >> 32);
 	}
-	//fprintf(stderr, "lower mask: %x\n", *lower_ptr);
-	//fprintf(stderr, "upper mask: %x\n", *upper_ptr);
+
+	//fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
+	//fprintf(stderr, "Final SM Mask (upper): %x\n", *upper_ptr);
 }
 
-static void setup_sm_control_11() {
+static void setup_sm_control_callback() {
 	int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
 	int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
 	uintptr_t* tbl_base;
@@ -207,38 +129,41 @@ static void setup_sm_control_11() {
 	subscribe = (typeof(subscribe))subscribe_func_addr;
 	enable = (typeof(enable))enable_func_addr;
 	int res = 0;
-	res = subscribe(&my_hndl, launchCallback, NULL);
+	res = subscribe(&my_hndl, control_callback_v2, NULL);
 	if (res)
 		abort(1, 0, "Error subscribing to launch callback. CUDA returned error code %d.", res);
-	res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD);
+	res = enable(1, my_hndl, QMD_DOMAIN, QMD_PRE_UPLOAD);
 	if (res)
 		abort(1, 0, "Error enabling launch callback. CUDA returned error code %d.", res);
 }
 
 // Set default mask for all launches
 void libsmctrl_set_global_mask(uint64_t mask) {
-	int ver;
-	cuDriverGetVersion(&ver);
-	if (ver == 10020) {
-		if (!g_sm_control)
-			setup_g_sm_control_10();
-		g_sm_control->mask = mask;
-		g_sm_control->enabled = 1;
-	} else if (ver > 10020) {
-		if (!sm_control_setup_called)
-			setup_sm_control_11();
-		g_sm_mask = mask;
-	} else { // < CUDA 10.2
-		abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; "
-		      "this application is using CUDA %d.%d",
-		      ver / 1000, (ver % 100));
+	if (!sm_control_setup_called) {
+		// The version will not change while running, so only check once
+		int ver = 0;
+		cuDriverGetVersion(&ver);
+		if (ver < 6050)
+			abort(1, ENOSYS, "Global masking requires at least CUDA 6.5; "
+			      "this application is using CUDA %d.%d",
+			      ver / 1000, (ver % 100));
+		setup_sm_control_callback();
 	}
+	g_sm_mask = mask;
 }
 
 // Set mask for next launch from this thread
 void libsmctrl_set_next_mask(uint64_t mask) {
-	if (!sm_control_setup_called)
-		setup_sm_control_11();
+	if (!sm_control_setup_called) {
+		// The version will not change while running, so only check once
+		int ver = 0;
+		cuDriverGetVersion(&ver);
+		if (ver < 6050)
+			abort(1, ENOSYS, "Next masking requires at least CUDA 6.5; "
+			      "this application is using CUDA %d.%d",
+			      ver / 1000, (ver % 100));
+		setup_sm_control_callback();
+	}
 	g_next_sm_mask = mask;
 }
 
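The V4 branch in the callback above writes four 32-bit mask words (offsets
+304 through +316) but only fills the lower two until 128-bit `..._ext`
setters exist. As an illustrative sketch only, assuming the word layout the
callback uses, a future 128-bit mask could split across those words like so:

    #include <stdint.h>

    typedef unsigned __int128 uint128_t;  // as in libsmctrl.h

    // Split a 128-bit TPC disable mask into the four 32-bit words a V4 TMD
    // expects, least-significant word first (hypothetical helper).
    static void split_mask_v4(uint128_t mask, uint32_t words[4]) {
        for (int i = 0; i < 4; i++)
            words[i] = (uint32_t)(mask >> (32 * i));
    }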
diff --git a/libsmctrl.h b/libsmctrl.h
index 6285de6..b85c0c7 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2024 Joshua Bakita
+ * Copyright 2022-2024 Joshua Bakita
 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
 * logic in the CUDA driver library, and thus requires a build with -lcuda.
 */
@@ -15,7 +15,7 @@ typedef unsigned __int128 uint128_t;
 
 // Set global default TPC mask for all kernels, incl. CUDA-internal ones
 // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
-// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1
+// Supported: CUDA 6.5 - CUDA 12.6
 extern void libsmctrl_set_global_mask(uint64_t mask);
 // Set default TPC mask for all kernels launched via `stream`
 // (overrides global mask)
@@ -27,7 +27,7 @@ extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
 // Set TPC mask for the next kernel launch from the caller's CPU thread
 // (overrides global and per-stream masks, applies only to next launch).
 // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
-// Supported: CUDA 11.0 - CUDA 12.1
+// Supported: CUDA 6.5 - CUDA 12.6
 extern void libsmctrl_set_next_mask(uint64_t mask);
 
 /**
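A brief usage sketch for the declarations above, showing the documented
precedence (the next mask overrides the global mask); mask values are
illustrative:

    #include "libsmctrl.h"

    int main(void) {
        // Default: restrict all kernels to TPCs 0-7 (set bits disable TPCs)
        libsmctrl_set_global_mask(~0xffull);
        // The very next launch from this thread may use TPCs 8-15 instead
        libsmctrl_set_next_mask(~0xff00ull);
        // ... launch kernels here; later launches fall back to the global mask
        return 0;
    }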