diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-02 22:14:22 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-03-02 22:14:22 -0500 |
commit | 7db0d3088a6e25c7c64999a20267f55751571dee (patch) | |
tree | 9867d0ede3818ade3a63f942446b40d2f1446254 |
Initial reimplementation of libsmctrl as a library
- Tested working with cuda_scheduling_examiner
- Supports everything described in the accepted RTAS'23 paper
- Can be used as either a shared or statically-linked library
- Documented in libsmctrl.h
-rw-r--r-- | .gitignore | 4 | ||||
-rw-r--r-- | Makefile | 21 | ||||
-rw-r--r-- | libsmctrl.c | 332 | ||||
-rw-r--r-- | libsmctrl.h | 62 | ||||
-rw-r--r-- | libsmctrl_test_gpc_info.c | 14 |
5 files changed, 433 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dcff266 --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1,4 @@ | |||
1 | libsmctrl.a | ||
2 | libsmctrl.o | ||
3 | libsmctrl.so | ||
4 | libsmctrl_test_gpc_info | ||
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..aa59792 --- /dev/null +++ b/Makefile | |||
@@ -0,0 +1,21 @@ | |||
CC = gcc
# -fPIC is needed in all cases, as we may be linked into another shared library
CFLAGS = -fPIC
LDFLAGS = -lcuda -I/usr/local/cuda/include

.PHONY: clean tests

# Shared-library build
libsmctrl.so: libsmctrl.c libsmctrl.h
	$(CC) $< -shared -o $@ $(CFLAGS) $(LDFLAGS)

# Static-library build (goes through an intermediate libsmctrl.o)
libsmctrl.a: libsmctrl.c libsmctrl.h
	$(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
	ar rcs $@ libsmctrl.o

# The test links against -lsmctrl, so make sure a library has been built first.
# Only the first prerequisite ($<) is passed to the compiler as a source file.
libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a
	$(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS)

tests: libsmctrl_test_gpc_info

# Remove every build artifact (the same set that .gitignore lists)
clean:
	rm -f libsmctrl.so libsmctrl.a libsmctrl.o libsmctrl_test_gpc_info
diff --git a/libsmctrl.c b/libsmctrl.c new file mode 100644 index 0000000..69b19a1 --- /dev/null +++ b/libsmctrl.c | |||
@@ -0,0 +1,332 @@ | |||
1 | /** | ||
2 | * Copyright 2022 Joshua Bakita | ||
3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug | ||
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | ||
5 | */ | ||
6 | |||
7 | //#include "/playpen/playpen/cuda-11.8/include/cuda.h" | ||
8 | #include <cuda.h> | ||
9 | //#include <cuda_runtime.h> | ||
10 | //#ifndef CUDA_VERSION | ||
11 | //#warning libsmctrl: CUDA driver library must be included before libsmctrl.h. | ||
12 | //#endif | ||
13 | |||
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
19 | |||
// Layout of mask control fields in CUDA's `globals` struct.
// Declared packed, so `mask` directly follows `enabled` without padding.
struct global_sm_control {
	// Set to 1 by libsmctrl_set_global_mask() whenever it writes a mask
	uint32_t enabled;
	// The 64-bit SM mask written by libsmctrl_set_global_mask()
	uint64_t mask;
} __attribute__((packed));
25 | |||
26 | /*** CUDA Globals Manipulation. CUDA 10.2 only ***/ | ||
27 | |||
// Cached pointer to the SM mask control fields inside CUDA's `globals`
// struct. Stays NULL until setup_sm_control_10() has computed it.
// Ends up being 0x7fb7fa3408 in some binaries
static struct global_sm_control* g_sm_control = NULL;
30 | |||
31 | /* Find the location of CUDA's `globals` struct and the SM mask control fields | ||
32 | * No symbols are exported from within `globals`, so this has to do a very | ||
33 | * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. | ||
34 | * Don't call this before the cuda library has been initialized. | ||
35 | */ | ||
/* Compute and cache (in g_sm_control) the address of the SM mask control
 * fields inside CUDA's `globals` struct. Does nothing if the address has
 * already been cached. Prints a warning to stderr if the located fields are
 * not zero, as that suggests the computed address is wrong.
 */
static void setup_sm_control_10() {
	if (g_sm_control)
		return;
	// Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by
	// the loader, but not subject to ASLR (it's always at a constant
	// offset in the loaded instance of libcuda.so). Our target is also at
	// a constant offset, so we can use the address of
	// cudbgReportDriverApiErrorFlags as a reference point.
	// Note: cudbgReportDriverApiErrorFlags is currently the closest known
	// symbol to **the table**. cudbgDebuggerInitialized is the closest to
	// globals itself (+7424 == SM mask control), but we prefer the table
	// lookup approach for now, as that's what cuDeviceGetCount() does.
	extern uint32_t cudbgReportDriverApiErrorFlags;
	// NOTE(review): `sym` is hard-coded to 0 and the real symbol address is
	// commented out, so every address below is computed relative to NULL
	// rather than to the symbol described above. Looks like this path has
	// been disabled---confirm before relying on it.
	uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;
	// In some binaries, the following works out to 0x7fb7ea6000, and
	// that's what shows up in the adrp instruction in cuDeviceGetCount()
	// in the lead-up to get globals.numDevices. Find this offset by
	// calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,
	// disassembling the prior instructions, taking the adrp constant, and
	// subtracting the address of cudbgReportDriverApiErrorFlags from it.
	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
	// Address of `globals` is at offset 3672 (entry 459?)
	uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64
	// SM mask control is at offset 4888 in the `globals` struct
	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
	// SM mask should be empty by default
	if (g_sm_control->enabled || g_sm_control->mask)
		fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");
}
65 | |||
66 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ | ||
67 | |||
// Tested working on CUDA x86_64 11.0-11.8.
// Tested not working on aarch64 or x86_64 10.2
// UUID handed to cuGetExportTable() in setup_sm_control_11() to obtain the
// table containing the callback `subscribe` and `enable` functions
static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
// Domain and callback ID passed to `enable` in setup_sm_control_11()
#define LAUNCH_DOMAIN 0x3
#define LAUNCH_PRE_UPLOAD 0x3
// Global default mask; written by libsmctrl_set_global_mask() and applied by
// launchCallback() when no other mask is present
static uint64_t g_sm_mask = 0;
// Per-thread mask for the next launch; written by libsmctrl_set_next_mask()
// and reset to zero by launchCallback() once applied
static __thread uint64_t g_next_sm_mask = 0;
// Atomically set on the first call to setup_sm_control_11(), so that the
// callback is only registered once
static char sm_control_setup_called = 0;
/* Callback registered with the CUDA driver by setup_sm_control_11(). Writes
 * an SM mask into the launch data reachable through `in_params`.
 *
 * Mask priority: the calling thread's g_next_sm_mask (if non-zero; cleared
 * after use), then any non-zero mask already present in the launch data, then
 * the global g_sm_mask.
 *
 * @param ukwn      Unused here (registered as NULL in setup_sm_control_11())
 * @param domain    Unused here
 * @param cbid      Unused here
 * @param in_params Opaque driver structure; only the fields described below
 *                  are accessed
 */
static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
	// The first 32-bit word of in_params must be at least 0x50.
	// NOTE(review): presumably a structure size or version field---confirm.
	if (*(uint32_t*)in_params < 0x50) {
		fprintf(stderr, "Unsupported CUDA version for callback-based SM masking. Aborting...\n");
		return;
	}
	// The cast binds tighter than the addition, so this reads the pointer
	// stored 8 pointer-widths into in_params and dereferences it twice
	if (!**((uintptr_t***)in_params+8)) {
		fprintf(stderr, "Called with NULL halLaunchDataAllocation\n");
		return;
	}
	//fprintf(stderr, "cta: %lx\n", *(uint64_t*)(**((char***)in_params + 8) + 74));
	// TODO: Check for supported QMD version (>XXX, <4.00)
	// TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 72 bytes (rather than 8 bytes) wide
	// Lower and upper 32 bits of the mask live at byte offsets 84 and 88
	uint32_t *lower_ptr = (uint32_t*)(**((char***)in_params + 8) + 84);
	uint32_t *upper_ptr = (uint32_t*)(**((char***)in_params + 8) + 88);
	if (g_next_sm_mask) {
		// A next-launch mask always wins, and is consumed by this launch
		*lower_ptr = (uint32_t)g_next_sm_mask;
		*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
		g_next_sm_mask = 0;
	} else if (!*lower_ptr && !*upper_ptr){
		// Only apply the global mask if a per-stream mask hasn't been set
		*lower_ptr = (uint32_t)g_sm_mask;
		*upper_ptr = (uint32_t)(g_sm_mask >> 32);
	}
	//fprintf(stderr, "lower mask: %x\n", *lower_ptr);
	//fprintf(stderr, "upper mask: %x\n", *upper_ptr);
}
102 | |||
103 | static void setup_sm_control_11() { | ||
104 | int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn); | ||
105 | int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid); | ||
106 | uintptr_t* tbl_base; | ||
107 | uint32_t my_hndl; | ||
108 | // Avoid race conditions (setup can only be called once) | ||
109 | if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST)) | ||
110 | return; | ||
111 | |||
112 | cuGetExportTable((const void**)&tbl_base, &callback_funcs_id); | ||
113 | uintptr_t subscribe_func_addr = *(tbl_base + 3); | ||
114 | uintptr_t enable_func_addr = *(tbl_base + 6); | ||
115 | subscribe = (typeof(subscribe))subscribe_func_addr; | ||
116 | enable = (typeof(enable))enable_func_addr; | ||
117 | int res = 0; | ||
118 | res = subscribe(&my_hndl, launchCallback, NULL); | ||
119 | if (res) { | ||
120 | fprintf(stderr, "libsmctrl: Error subscribing to launch callback. Error %d\n", res); | ||
121 | return; | ||
122 | } | ||
123 | res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD); | ||
124 | if (res) | ||
125 | fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); | ||
126 | } | ||
127 | |||
128 | // Common masking control | ||
129 | void libsmctrl_set_global_mask(uint64_t mask) { | ||
130 | int ver; | ||
131 | cuDriverGetVersion(&ver); | ||
132 | if (ver <= 10020) { | ||
133 | if (!g_sm_control) | ||
134 | setup_sm_control_10(); | ||
135 | g_sm_control->mask = mask; | ||
136 | g_sm_control->enabled = 1; | ||
137 | } else { | ||
138 | if (!sm_control_setup_called) | ||
139 | setup_sm_control_11(); | ||
140 | g_sm_mask = mask; | ||
141 | } | ||
142 | } | ||
143 | |||
// Older name for libsmctrl_set_global_mask(); simply forwards `mask` to it.
void set_sm_mask(uint64_t mask) {
	libsmctrl_set_global_mask(mask);
}
147 | |||
// Set mask for next launch from this thread
// The mask is stored in the thread-local g_next_sm_mask, which
// launchCallback() applies and then resets to zero.
// Note that, unlike libsmctrl_set_global_mask(), no driver version check is
// made before registering the callback.
void libsmctrl_set_next_mask(uint64_t mask) {
	// Make sure the launch callback is registered before the mask is needed
	if (!sm_control_setup_called)
		setup_sm_control_11();
	g_next_sm_mask = mask;
}
154 | |||
155 | |||
156 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ | ||
157 | |||
// Byte offsets of the SM mask field from the base of CUDA's stream struct,
// one per group of CUDA versions. libsmctrl_set_stream_mask() selects the
// offset matching the driver version reported by cuDriverGetVersion().
#define CU_8_0_MASK_OFF 0xec
#define CU_9_0_MASK_OFF 0x130
// CUDA 9.0 and 9.1 use the same offset
#define CU_9_2_MASK_OFF 0x140
#define CU_10_0_MASK_OFF 0x24c
// CUDA 10.0, 10.1 and 10.2 use the same offset
#define CU_11_0_MASK_OFF 0x274
#define CU_11_1_MASK_OFF 0x2c4
#define CU_11_2_MASK_OFF 0x37c
// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
#define CU_11_6_MASK_OFF 0x38c
#define CU_11_7_MASK_OFF 0x3c4
#define CU_11_8_MASK_OFF 0x47c
#define CU_12_0_MASK_OFF 0x4cc
// CUDA 12.0 and 12.1 use the same offset
173 | |||
// Layout in CUDA's `stream` struct
// Note the field order: the upper half of the mask is stored first.
struct stream_sm_mask {
	// Bits 63:32 of the mask (written as `mask >> 32`)
	uint32_t upper;
	// Bits 31:0 of the mask
	uint32_t lower;
} __attribute__((packed));
179 | |||
180 | // Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1 | ||
181 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | ||
182 | // our header | ||
183 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | ||
184 | char* stream_struct_base = *(char**)stream; | ||
185 | struct stream_sm_mask* hw_mask; | ||
186 | int ver; | ||
187 | cuDriverGetVersion(&ver); | ||
188 | switch (ver) { | ||
189 | case 8000: | ||
190 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); | ||
191 | case 9000: | ||
192 | case 9010: | ||
193 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); | ||
194 | break; | ||
195 | case 9020: | ||
196 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); | ||
197 | break; | ||
198 | case 10000: | ||
199 | case 10010: | ||
200 | case 10020: | ||
201 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_0_MASK_OFF); | ||
202 | break; | ||
203 | case 11000: | ||
204 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_0_MASK_OFF); | ||
205 | break; | ||
206 | case 11010: | ||
207 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_1_MASK_OFF); | ||
208 | break; | ||
209 | case 11020: | ||
210 | case 11030: | ||
211 | case 11040: | ||
212 | case 11050: | ||
213 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_2_MASK_OFF); | ||
214 | break; | ||
215 | case 11060: | ||
216 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_6_MASK_OFF); | ||
217 | break; | ||
218 | case 11070: | ||
219 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_7_MASK_OFF); | ||
220 | break; | ||
221 | case 11080: | ||
222 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF); | ||
223 | break; | ||
224 | case 12000: | ||
225 | case 12010: | ||
226 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | ||
227 | break; | ||
228 | default: { | ||
229 | // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported) | ||
230 | char* mask_off_str = getenv("MASK_OFF"); | ||
231 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | ||
232 | if (mask_off_str) { | ||
233 | int off = atoi(mask_off_str); | ||
234 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off); | ||
235 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off); | ||
236 | } else { | ||
237 | return; | ||
238 | }} | ||
239 | } | ||
240 | |||
241 | hw_mask->upper = mask >> 32; | ||
242 | hw_mask->lower = mask; | ||
243 | } | ||
244 | |||
245 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | ||
246 | int num_sms; | ||
247 | int major; | ||
248 | int minor; | ||
249 | // TODO: Use nvdebug instead of this hardcoded hack | ||
250 | cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); | ||
251 | cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); | ||
252 | cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); | ||
253 | // SM masking only works on sm_35+ | ||
254 | if (major < 3 || (major == 3 && minor < 5)) | ||
255 | return -ENOTSUP; | ||
256 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | ||
257 | // as the P100, which is uniquely sm_60 | ||
258 | int sms_per_tpc; | ||
259 | if (major > 6 || (major == 6 && minor == 0)) | ||
260 | sms_per_tpc = 2; | ||
261 | else | ||
262 | sms_per_tpc = 1; | ||
263 | // It looks like there may be some upcoming weirdness (TPCs with only one SM?) | ||
264 | // with Hopper | ||
265 | if (major >= 9) | ||
266 | fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n"); | ||
267 | *num_tpcs = num_sms/sms_per_tpc; | ||
268 | return 0; | ||
269 | } | ||
270 | |||
// Read an integer from a file in `/proc`
// The start of the file is parsed as a hexadecimal number (an optional "0x"
// prefix is accepted). Content without any hex digits yields 0.
// @param filename Path of the file to read
// @param out      Receives the parsed value on success
// @return 0 on success, or a negated errno if the file cannot be opened or
//         read
static int read_int_procfile(const char* filename, uint64_t* out) {
	// Large enough for "0x" plus 16 hex digits, a newline, and a terminator
	char f_data[32] = {0};
	int fd = open(filename, O_RDONLY);
	if (fd == -1)
		return -errno;
	// Leave room for the terminator, so that the parse below can never run
	// off the end of the buffer
	ssize_t len = read(fd, f_data, sizeof(f_data) - 1);
	// Save errno now, as close() may overwrite it
	int read_errno = errno;
	close(fd);
	if (len < 0)
		return -read_errno;
	f_data[len] = '\0';
	// Unsigned parse, so that values above INT64_MAX are not clamped
	*out = strtoull(f_data, NULL, 16);
	return 0;
}
282 | |||
// Dimensions of the preallocated per-device, per-GPC mask table
enum { LIBSMCTRL_MAX_DEVS = 16, LIBSMCTRL_MAX_GPCS = 12 };
// Backing storage for the arrays handed out via `tpcs_for_gpc`
static uint64_t tpc_mask_per_gpc_per_dev[LIBSMCTRL_MAX_DEVS][LIBSMCTRL_MAX_GPCS];

/* Build, for every enabled GPC of device `dev`, a bitmask of the TPCs which
 * belong to it. All information is read from the `/proc/gpu<dev>/` files.
 *
 * Enabled TPCs are numbered consecutively from 0, in GPC order, and bit N of
 * a GPC's mask is set if TPC number N is part of that GPC.
 *
 * @param num_enabled_gpcs Output; number of enabled GPCs (0 on any error)
 * @param tpcs_for_gpc     Output; points at a static array of
 *                         `*num_enabled_gpcs` masks. The array is owned by
 *                         the library, and is overwritten by the next call
 *                         for the same `dev`. Untouched on error.
 * @param dev              Device number, used to index the table and to build
 *                         the `/proc/gpu<dev>/` paths
 * @return 0 on success, -ERANGE if the device does not fit the preallocated
 *         table or a 64-bit mask, or a negated errno if a file is unreadable
 */
int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
	uint32_t i, j, enabled_gpcs = 0, vtpc_idx = 0;
	uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask;
	int err;
	char filename[100];
	*num_enabled_gpcs = 0;
	// Maximum number of GPCs supported for this chip
	snprintf(filename, sizeof(filename), "/proc/gpu%d/num_gpcs", dev);
	if ((err = read_int_procfile(filename, &max_gpcs))) {
		fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");
		return err;
	}
	// TODO: handle arbitrary-size GPUs
	// Valid table rows are 0..LIBSMCTRL_MAX_DEVS-1
	if (dev < 0 || dev >= LIBSMCTRL_MAX_DEVS || max_gpcs > LIBSMCTRL_MAX_GPCS) {
		fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
		return -ERANGE;
	}
	// Set bit = disabled GPC
	snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc_mask", dev);
	if ((err = read_int_procfile(filename, &gpc_mask)))
		return err;
	snprintf(filename, sizeof(filename), "/proc/gpu%d/num_tpc_per_gpc", dev);
	if ((err = read_int_procfile(filename, &num_tpc_per_gpc)))
		return err;
	// The per-GPC TPC mask is tested bit-by-bit, and only has 64 bits
	if (num_tpc_per_gpc > 64) {
		fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
		return -ERANGE;
	}
	// For each enabled GPC
	for (i = 0; i < max_gpcs; i++) {
		// Skip this GPC if disabled
		if ((1ull << i) & gpc_mask)
			continue;
		// Get the bitstring of TPCs disabled for this GPC
		// Set bit = disabled TPC
		snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc%u_tpc_mask", dev, i);
		if ((err = read_int_procfile(filename, &gpc_tpc_mask)))
			return err;
		uint64_t* tpc_mask = &tpc_mask_per_gpc_per_dev[dev][enabled_gpcs];
		enabled_gpcs++;
		*tpc_mask = 0;
		for (j = 0; j < num_tpc_per_gpc; j++) {
			// Skip disabled TPCs
			if ((1ull << j) & gpc_tpc_mask)
				continue;
			// The output masks only have room for 64 TPCs
			if (vtpc_idx >= 64) {
				fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
				return -ERANGE;
			}
			// 64-bit shift; a plain `1 << n` is only 32 bits wide
			*tpc_mask |= (1ull << vtpc_idx);
			vtpc_idx++;
		}
	}
	// Publish results only once everything has been read successfully
	*num_enabled_gpcs = enabled_gpcs;
	*tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
	return 0;
}
332 | |||
diff --git a/libsmctrl.h b/libsmctrl.h new file mode 100644 index 0000000..7be425d --- /dev/null +++ b/libsmctrl.h | |||
@@ -0,0 +1,62 @@ | |||
1 | /** | ||
2 | * Copyright 2022 Joshua Bakita | ||
3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug | ||
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | ||
5 | */ | ||
6 | |||
7 | #ifdef __cplusplus | ||
8 | extern "C" { | ||
9 | #endif | ||
10 | |||
11 | /* PARTITIONING FUNCTIONS */ | ||
12 | |||
13 | // Set global default TPC mask for all kernels, incl. CUDA-internal ones | ||
14 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | ||
15 | // Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8 | ||
16 | extern void libsmctrl_set_global_mask(uint64_t mask); | ||
17 | // Set default TPC mask for all kernels launched via `stream` | ||
18 | // (overrides global mask) | ||
19 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | ||
20 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | ||
21 | // Supported: CUDA 8.0 - CUDA 11.8 | ||
22 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | ||
23 | // Set TPC mask for the next kernel launch from the caller's CPU thread | ||
24 | // (overrides global and per-stream masks, applies only to next launch). | ||
25 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | ||
26 | // Supported: CUDA 11.0 - CUDA 11.8 | ||
27 | extern void libsmctrl_set_next_mask(uint64_t mask); | ||
28 | |||
29 | // **DEPRECATED**: Old name for libsmctrl_set_global_mask() | ||
30 | extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()"))); | ||
31 | |||
32 | /** | ||
33 | * Notes on Bitmasks | ||
34 | * | ||
35 | * All of the core partitioning functions take a `uint64_t mask` parameter. A | ||
36 | * set bit in the mask indicates that the respective Thread Processing Cluster | ||
37 | * (TPC) is to be __disabled__. | ||
38 | * | ||
39 | * Examples | ||
40 | * To prohibit the next kernel from using TPC 0: | ||
41 | * libsmctrl_set_next_mask(0x1); | ||
42 | * Allow kernels to only use TPC 0 by default: | ||
43 | * libsmctrl_set_global_mask(~0x1ull); | ||
44 | * Allow kernels in a stream to only use TPCs 2, 3, and 4: | ||
45 | * libsmctrl_set_stream_mask(stream, ~0b00111100ull); | ||
46 | * | ||
47 | * Note that the bitwise inversion operator (~, as used above) is very useful, | ||
48 | * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull) | ||
49 | */ | ||
50 | |||
51 | /* INFORMATIONAL FUNCTIONS */ | ||
52 | |||
53 | // Get total number of TPCs on device number `dev`. | ||
54 | extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev); | ||
55 | // Get number of GPCs for device number `dev`, and a GPC-indexed array | |
56 | // containing masks of which TPCs are associated with each GPC. | ||
57 | // Note that the `nvdebug` module must be loaded to use this function. | ||
58 | extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev); | ||
59 | |||
60 | #ifdef __cplusplus | ||
61 | } | ||
62 | #endif | ||
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c new file mode 100644 index 0000000..93bfc1e --- /dev/null +++ b/libsmctrl_test_gpc_info.c | |||
@@ -0,0 +1,14 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdint.h> | ||
3 | #include "libsmctrl.h" | ||
4 | |||
// Print the number of enabled GPCs on device 1, and the mask of TPCs
// associated with each of them. Exits with status 1 if the information
// cannot be obtained.
int main() {
	uint32_t num_gpcs;
	uint64_t* masks;
	// `masks` is only valid if the call succeeds, so check before using it
	int err = libsmctrl_get_gpc_info(&num_gpcs, &masks, 1);
	if (err) {
		fprintf(stderr, "libsmctrl_get_gpc_info() failed with error %d\n", err);
		return 1;
	}
	printf("Num GPCs: %u\n", (unsigned)num_gpcs);
	for (uint32_t i = 0; i < num_gpcs; i++) {
		// Casts keep the format specifiers correct on every platform
		printf("Mask of TPCs associated with GPC %u: %#llx\n", (unsigned)i, (unsigned long long)masks[i]);
	}
	return 0;
}