aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-03-02 22:14:22 -0500
committerJoshua Bakita <bakitajoshua@gmail.com>2023-03-02 22:14:22 -0500
commit7db0d3088a6e25c7c64999a20267f55751571dee (patch)
tree9867d0ede3818ade3a63f942446b40d2f1446254
Initial reimplementation of libsmctrl as a library
- Tested working with cuda_scheduling_examiner - Supports everything described in the accepted RTAS'23 paper - Can be used as either a shared or statically-linked library - Documented in libsmctrl.h
-rw-r--r--.gitignore4
-rw-r--r--Makefile21
-rw-r--r--libsmctrl.c332
-rw-r--r--libsmctrl.h62
-rw-r--r--libsmctrl_test_gpc_info.c14
5 files changed, 433 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dcff266
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
1libsmctrl.a
2libsmctrl.o
3libsmctrl.so
4libsmctrl_test_gpc_info
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..aa59792
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
CC = gcc
# -fPIC is needed in all cases, as we may be linked into another shared library
# The CUDA include path is a compile-time flag, so it belongs here rather than
# in LDFLAGS
CFLAGS = -fPIC -I/usr/local/cuda/include
LDFLAGS = -lcuda

.PHONY: clean tests

# Default target: shared library
libsmctrl.so: libsmctrl.c libsmctrl.h
	$(CC) $< -shared -o $@ $(CFLAGS) $(LDFLAGS)

# Static library (compile only; nothing is linked in this rule)
libsmctrl.a: libsmctrl.c libsmctrl.h
	$(CC) $< -c -o libsmctrl.o $(CFLAGS)
	ar rcs $@ libsmctrl.o

# The test links against -lsmctrl, so make sure a library has been built first
libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.h libsmctrl.a
	$(CC) $< -o $@ $(CFLAGS) -L. -lsmctrl $(LDFLAGS)

tests: libsmctrl_test_gpc_info

# Remove every build product, including the intermediate object and test binary
clean:
	rm -f libsmctrl.so libsmctrl.a libsmctrl.o libsmctrl_test_gpc_info
diff --git a/libsmctrl.c b/libsmctrl.c
new file mode 100644
index 0000000..69b19a1
--- /dev/null
+++ b/libsmctrl.c
@@ -0,0 +1,332 @@
1/**
2 * Copyright 2022 Joshua Bakita
3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 */
6
7//#include "/playpen/playpen/cuda-11.8/include/cuda.h"
8#include <cuda.h>
9//#include <cuda_runtime.h>
10//#ifndef CUDA_VERSION
11//#warning libsmctrl: CUDA driver library must be included before libsmctrl.h.
12//#endif
13
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
19
/*** CUDA Globals Manipulation. CUDA 10.2 only ***/

// Mirror of the mask control fields inside CUDA's `globals` struct. Packed so
// that `mask` immediately follows `enabled` with no padding.
struct global_sm_control {
	uint32_t enabled; // Written as 1 by libsmctrl_set_global_mask() after a mask is stored
	uint64_t mask;    // The global SM mask
} __attribute__((packed));

// Cached location of the control fields; stays NULL until
// setup_sm_control_10() has located them.
// Ends up being 0x7fb7fa3408 in some binaries
static struct global_sm_control *g_sm_control = NULL;
30
31/* Find the location of CUDA's `globals` struct and the SM mask control fields
32 * No symbols are exported from within `globals`, so this has to do a very
33 * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
34 * Don't call this before the cuda library has been initialized.
35 */
36static void setup_sm_control_10() {
37 if (g_sm_control)
38 return;
39 // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by
40 // the loader, but not subject to ASLR (it's always at a constant
41 // offset in the loaded instance of libcuda.so). Our target is also at
42 // a constant offset, so we can use the address of
43 // cudbgReportDriverApiErrorFlags as a reference point.
44 // Note: cudbgReportDriverApiErrorFlags is currently the closest known
45 // symbol to **the table**. cudbgDebuggerInitialized is the closest to
46 // globals itself (+7424 == SM mask control), but we perfer the table
47 // lookup approach for now, as that's what cuDeviceGetCount() does.
48 extern uint32_t cudbgReportDriverApiErrorFlags;
49 uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;
50 // In some binaries, the following works out to 0x7fb7ea6000, and
51 // that's what shows up in the adrp instruction in cuDeviceGetCount()
52 // in the lead-up to get globals.numDevices. Find this offset by
53 // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,
54 // disassembling the prior instructions, taking the adrp constant, and
55 // subtracting the address of cudbgReportDriverApiErrorFlags from it.
56 uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
57 // Address of `globals` is at offset 3672 (entry 459?)
58 uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64
59 // SM mask control is at offset 4888 in the `globals` struct
60 g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
61 // SM mask should be empty by default
62 if (g_sm_control->enabled || g_sm_control->mask)
63 fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");
64}
65
66/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
67
68// Tested working on CUDA x86_64 11.0-11.8.
69// Tested not working on aarch64 or x86_64 10.2
70static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
71#define LAUNCH_DOMAIN 0x3
72#define LAUNCH_PRE_UPLOAD 0x3
73static uint64_t g_sm_mask = 0;
74static __thread uint64_t g_next_sm_mask = 0;
75static char sm_control_setup_called = 0;
76static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
77 if (*(uint32_t*)in_params < 0x50) {
78 fprintf(stderr, "Unsupported CUDA version for callback-based SM masking. Aborting...\n");
79 return;
80 }
81 if (!**((uintptr_t***)in_params+8)) {
82 fprintf(stderr, "Called with NULL halLaunchDataAllocation\n");
83 return;
84 }
85 //fprintf(stderr, "cta: %lx\n", *(uint64_t*)(**((char***)in_params + 8) + 74));
86 // TODO: Check for supported QMD version (>XXX, <4.00)
87 // TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 72 bytes (rather than 8 bytes) wide
88 uint32_t *lower_ptr = (uint32_t*)(**((char***)in_params + 8) + 84);
89 uint32_t *upper_ptr = (uint32_t*)(**((char***)in_params + 8) + 88);
90 if (g_next_sm_mask) {
91 *lower_ptr = (uint32_t)g_next_sm_mask;
92 *upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
93 g_next_sm_mask = 0;
94 } else if (!*lower_ptr && !*upper_ptr){
95 // Only apply the global mask if a per-stream mask hasn't been set
96 *lower_ptr = (uint32_t)g_sm_mask;
97 *upper_ptr = (uint32_t)(g_sm_mask >> 32);
98 }
99 //fprintf(stderr, "lower mask: %x\n", *lower_ptr);
100 //fprintf(stderr, "upper mask: %x\n", *upper_ptr);
101}
102
103static void setup_sm_control_11() {
104 int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
105 int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
106 uintptr_t* tbl_base;
107 uint32_t my_hndl;
108 // Avoid race conditions (setup can only be called once)
109 if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
110 return;
111
112 cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
113 uintptr_t subscribe_func_addr = *(tbl_base + 3);
114 uintptr_t enable_func_addr = *(tbl_base + 6);
115 subscribe = (typeof(subscribe))subscribe_func_addr;
116 enable = (typeof(enable))enable_func_addr;
117 int res = 0;
118 res = subscribe(&my_hndl, launchCallback, NULL);
119 if (res) {
120 fprintf(stderr, "libsmctrl: Error subscribing to launch callback. Error %d\n", res);
121 return;
122 }
123 res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD);
124 if (res)
125 fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
126}
127
128// Common masking control
129void libsmctrl_set_global_mask(uint64_t mask) {
130 int ver;
131 cuDriverGetVersion(&ver);
132 if (ver <= 10020) {
133 if (!g_sm_control)
134 setup_sm_control_10();
135 g_sm_control->mask = mask;
136 g_sm_control->enabled = 1;
137 } else {
138 if (!sm_control_setup_called)
139 setup_sm_control_11();
140 g_sm_mask = mask;
141 }
142}
143
// Old name for libsmctrl_set_global_mask(); forwards `mask` unchanged.
void set_sm_mask(uint64_t mask)
{
	libsmctrl_set_global_mask(mask);
}
147
148// Set mask for next launch from this thread
149void libsmctrl_set_next_mask(uint64_t mask) {
150 if (!sm_control_setup_called)
151 setup_sm_control_11();
152 g_next_sm_mask = mask;
153}
154
155
/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/

// Byte offset of the SM mask within CUDA's `stream` struct, per CUDA version.
// Versions not listed share the offset of the closest older listed version:
//  - CUDA 9.0 and 9.1 use the same offset
//  - CUDA 10.0, 10.1 and 10.2 use the same offset
//  - CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
//  - CUDA 12.0 and 12.1 use the same offset
enum {
	CU_8_0_MASK_OFF  = 0xec,
	CU_9_0_MASK_OFF  = 0x130,
	CU_9_2_MASK_OFF  = 0x140,
	CU_10_0_MASK_OFF = 0x24c,
	CU_11_0_MASK_OFF = 0x274,
	CU_11_1_MASK_OFF = 0x2c4,
	CU_11_2_MASK_OFF = 0x37c,
	CU_11_6_MASK_OFF = 0x38c,
	CU_11_7_MASK_OFF = 0x3c4,
	CU_11_8_MASK_OFF = 0x47c,
	CU_12_0_MASK_OFF = 0x4cc
};
173
// Layout in CUDA's `stream` struct
// Two adjacent 32-bit words; `upper` sits at the lower address. Packed so the
// struct can be overlaid at any byte offset.
struct stream_sm_mask {
	uint32_t upper; // Receives bits 63..32 of the mask
	uint32_t lower; // Receives bits 31..0 of the mask
} __attribute__((packed));
179
180// Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1
181// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
182// our header
183void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
184 char* stream_struct_base = *(char**)stream;
185 struct stream_sm_mask* hw_mask;
186 int ver;
187 cuDriverGetVersion(&ver);
188 switch (ver) {
189 case 8000:
190 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
191 case 9000:
192 case 9010:
193 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
194 break;
195 case 9020:
196 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF);
197 break;
198 case 10000:
199 case 10010:
200 case 10020:
201 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_0_MASK_OFF);
202 break;
203 case 11000:
204 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_0_MASK_OFF);
205 break;
206 case 11010:
207 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_1_MASK_OFF);
208 break;
209 case 11020:
210 case 11030:
211 case 11040:
212 case 11050:
213 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_2_MASK_OFF);
214 break;
215 case 11060:
216 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_6_MASK_OFF);
217 break;
218 case 11070:
219 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_7_MASK_OFF);
220 break;
221 case 11080:
222 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF);
223 break;
224 case 12000:
225 case 12010:
226 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
227 break;
228 default: {
229 // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported)
230 char* mask_off_str = getenv("MASK_OFF");
231 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
232 if (mask_off_str) {
233 int off = atoi(mask_off_str);
234 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off);
235 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off);
236 } else {
237 return;
238 }}
239 }
240
241 hw_mask->upper = mask >> 32;
242 hw_mask->lower = mask;
243}
244
245int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
246 int num_sms;
247 int major;
248 int minor;
249 // TODO: Use nvdebug instead of this hardcoded hack
250 cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
251 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
252 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
253 // SM masking only works on sm_35+
254 if (major < 3 || (major == 3 && minor < 5))
255 return -ENOTSUP;
256 // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
257 // as the P100, which is uniquely sm_60
258 int sms_per_tpc;
259 if (major > 6 || (major == 6 && minor == 0))
260 sms_per_tpc = 2;
261 else
262 sms_per_tpc = 1;
263 // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
264 // with Hopper
265 if (major >= 9)
266 fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
267 *num_tpcs = num_sms/sms_per_tpc;
268 return 0;
269}
270
// Read an integer from a file in `/proc`
// The file content is parsed as hexadecimal (an optional "0x" prefix is
// accepted); anything after the number, such as a newline, is ignored.
// @param filename Path of the file to read
// @param out      Receives the parsed value on success; untouched on failure
// @return 0 on success, -errno if the file cannot be opened or read, -EINVAL
//         if the file does not start with a hexadecimal number
static int read_int_procfile(const char* filename, uint64_t* out) {
	// Room for "0x", 16 hex digits, a newline and the terminator, with slack
	char f_data[32] = {0};
	char* end = NULL;
	int fd = open(filename, O_RDONLY);
	if (fd == -1)
		return -errno;
	// Leave the last byte free so the buffer is always NUL-terminated
	ssize_t len = read(fd, f_data, sizeof(f_data) - 1);
	// close() may overwrite errno, so capture read()'s error first
	int read_err = errno;
	close(fd);
	if (len < 0)
		return -read_err;
	f_data[len] = '\0';
	// Unsigned conversion: masks may use all 64 bits
	unsigned long long val = strtoull(f_data, &end, 16);
	if (end == f_data)
		return -EINVAL;
	*out = val;
	return 0;
}
282
// Backing storage for the masks returned by libsmctrl_get_gpc_info(), indexed
// [device][enabled GPC index]. Entries for a device are overwritten by each
// call for that device.
static uint64_t tpc_mask_per_gpc_per_dev[16][12];

// Build, for each enabled GPC of device `dev`, a bitmask of the TPCs that
// belong to it. TPCs are numbered consecutively across enabled GPCs, skipping
// disabled ones. All inputs are read from the `/proc/gpu<dev>/` files.
// @param num_enabled_gpcs Receives the number of enabled GPCs (reset to 0 on
//                         entry, incremented as GPCs are processed)
// @param tpcs_for_gpc     On success, receives a pointer to static storage
//                         holding one mask per enabled GPC; do not free
// @param dev              Device number
// @return 0 on success, a negative errno value if a /proc file cannot be
//         read, or -ERANGE if the device does not fit the preallocated map or
//         a 64-bit mask
int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
	uint32_t i, j, vtpc_idx = 0;
	uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask;
	int err;
	char filename[100];
	*num_enabled_gpcs = 0;
	// Maximum number of GPCs supported for this chip
	snprintf(filename, sizeof(filename), "/proc/gpu%d/num_gpcs", dev);
	if ((err = read_int_procfile(filename, &max_gpcs))) {
		fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");
		return err;
	}
	// TODO: handle arbitrary-size GPUs
	// Valid device indices for the 16-entry map are 0..15
	if (dev < 0 || dev >= 16 || max_gpcs > 12) {
		fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
		return -ERANGE;
	}
	// Set bit = disabled GPC
	snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc_mask", dev);
	if ((err = read_int_procfile(filename, &gpc_mask)))
		return err;
	snprintf(filename, sizeof(filename), "/proc/gpu%d/num_tpc_per_gpc", dev);
	if ((err = read_int_procfile(filename, &num_tpc_per_gpc)))
		return err;
	// Per-GPC TPC masks are tested bit-by-bit in a 64-bit word below
	if (num_tpc_per_gpc > 64)
		return -ERANGE;
	// For each enabled GPC
	for (i = 0; i < max_gpcs; i++) {
		// Skip this GPC if disabled
		// (64-bit literal: the masks are 64 bits wide, and shifting an int
		// by 31 or more is not valid)
		if ((1ull << i) & gpc_mask)
			continue;
		(*num_enabled_gpcs)++;
		// Get the bitstring of TPCs disabled for this GPC
		// Set bit = disabled TPC
		snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc%u_tpc_mask", dev, i);
		if ((err = read_int_procfile(filename, &gpc_tpc_mask)))
			return err;
		uint64_t* tpc_mask = &tpc_mask_per_gpc_per_dev[dev][*num_enabled_gpcs - 1];
		*tpc_mask = 0;
		for (j = 0; j < num_tpc_per_gpc; j++) {
			// Skip disabled TPCs
			if ((1ull << j) & gpc_tpc_mask)
				continue;
			// More enabled TPCs than a 64-bit mask can represent
			if (vtpc_idx >= 64)
				return -ERANGE;
			*tpc_mask |= (1ull << vtpc_idx);
			vtpc_idx++;
		}
	}
	*tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
	return 0;
}
332
diff --git a/libsmctrl.h b/libsmctrl.h
new file mode 100644
index 0000000..7be425d
--- /dev/null
+++ b/libsmctrl.h
@@ -0,0 +1,62 @@
/**
 * Copyright 2022 Joshua Bakita
 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
 * logic in the CUDA driver library, and thus requires a build with -lcuda.
 */

#ifndef LIBSMCTRL_H
#define LIBSMCTRL_H

// For uint32_t and uint64_t, so that this header is self-contained
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* PARTITIONING FUNCTIONS */

// Set global default TPC mask for all kernels, incl. CUDA-internal ones
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8
extern void libsmctrl_set_global_mask(uint64_t mask);
// Set default TPC mask for all kernels launched via `stream`
// (overrides global mask)
// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 8.0 - CUDA 12.1
extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
// Set TPC mask for the next kernel launch from the caller's CPU thread
// (overrides global and per-stream masks, applies only to next launch).
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 11.0 - CUDA 11.8
extern void libsmctrl_set_next_mask(uint64_t mask);

// **DEPRECATED**: Old name for libsmctrl_set_global_mask()
extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()")));

/**
 * Notes on Bitmasks
 *
 * All of the core partitioning functions take a `uint64_t mask` parameter. A
 * set bit in the mask indicates that the respective Thread Processing Cluster
 * (TPC) is to be __disabled__.
 *
 * Examples
 * To prohibit the next kernel from using TPC 0:
 *     libsmctrl_set_next_mask(0x1);
 * Allow kernels to only use TPC 0 by default:
 *     libsmctrl_set_global_mask(~0x1ull);
 * Allow kernels in a stream to only use TPCs 2, 3, and 4:
 *     libsmctrl_set_stream_mask(stream, ~0b00111100ull);
 *
 * Note that the bitwise inversion operator (~, as used above) is very useful,
 * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull)
 */

/* INFORMATIONAL FUNCTIONS */

// Get total number of TPCs on device number `dev`.
// @return 0 on success, or a negative errno-style code on failure
extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
// Get number of GPCs for device number `dev`, and a GPC-indexed array
// containing masks of which TPCs are associated with each GPC.
// Note that the `nvdebug` module must be loaded to use this function.
// @return 0 on success, or a negative errno-style code on failure
extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);

#ifdef __cplusplus
}
#endif

#endif /* LIBSMCTRL_H */
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c
new file mode 100644
index 0000000..93bfc1e
--- /dev/null
+++ b/libsmctrl_test_gpc_info.c
@@ -0,0 +1,14 @@
1#include <stdio.h>
2#include <stdint.h>
3#include "libsmctrl.h"
4
// Print the number of enabled GPCs on device 1 and the TPC mask of each.
// Exits with status 1 if the information cannot be retrieved.
int main() {
	uint32_t num_gpcs = 0;
	uint64_t* masks = NULL;
	// A failed query leaves `masks` unset, so stop rather than print it
	int err = libsmctrl_get_gpc_info(&num_gpcs, &masks, 1);
	if (err) {
		fprintf(stderr, "libsmctrl_get_gpc_info() failed with error %d\n", err);
		return 1;
	}
	printf("Num GPCs: %u\n", num_gpcs);
	for (uint32_t i = 0; i < num_gpcs; i++) {
		// Cast so the format specifier matches on every platform
		printf("Mask of TPCs associated with GPC %u: %#llx\n", i, (unsigned long long)masks[i]);
	}
	return 0;
}