aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Joshua Bakita <jbakita@cs.unc.edu>, 2024-11-26 11:55:26 -0500
committer: Joshua Bakita <jbakita@cs.unc.edu>, 2024-11-26 11:58:37 -0500
commit: 3f9bda39d84f168c1b9f8c26075a72574645f00f (patch)
tree: 26d7bc0b51138d9873fac86dbbea4067960fbaea
parent: ebf2f07de91d9e341acc6df25e928e87b25b958d (diff)
Support stream masking on CUDA 12.4 (x86) and 12.6 (x86, aarch64)
Credit to Nordine Feddal for testing CUDA 12.4 on 550.54.14.
-rw-r--r-- README.md 2
-rw-r--r-- libsmctrl.c 21
-rw-r--r-- libsmctrl.h 8
3 files changed, 26 insertions, 5 deletions
diff --git a/README.md b/README.md
index 3689ecd..0c99113 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ make tests
93#### Known Working 93#### Known Working
94 94
95- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs 95- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
96- CUDA 8.1 through 12.2 96- CUDA 8.0 through 12.2, plus 12.4 and 12.6
97- `x86_64` and Jetson `aarch64` platforms 97- `x86_64` and Jetson `aarch64` platforms
98 98
99#### Known Issues 99#### Known Issues
diff --git a/libsmctrl.c b/libsmctrl.c
index 817cb5d..b10b885 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -11,7 +11,7 @@
11 * +-----------+---------------+---------------+--------------+ 11 * +-----------+---------------+---------------+--------------+
12 * | Version | Global Mask | Stream Mask | Next Mask | 12 * | Version | Global Mask | Stream Mask | Next Mask |
13 * +-----------+---------------+---------------+--------------+ 13 * +-----------+---------------+---------------+--------------+
14 * | 11.0-12.2 | TMD/QMD Hook | stream struct | TMD/QMD Hook | 14 * | 11.0-12.6 | TMD/QMD Hook | stream struct | TMD/QMD Hook |
15 * | 10.2 | global struct | stream struct | N/A | 15 * | 10.2 | global struct | stream struct | N/A |
16 * | 8.0-10.1 | N/A | stream struct | N/A | 16 * | 8.0-10.1 | N/A | stream struct | N/A |
17 * +-----------+---------------+---------------+--------------+ 17 * +-----------+---------------+---------------+--------------+
@@ -269,12 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) {
269// 12.0 tested on 525.147.05 269// 12.0 tested on 525.147.05
270#define CU_12_2_MASK_OFF 0x4e4 270#define CU_12_2_MASK_OFF 0x4e4
271// 12.2 tested on 535.129.03 271// 12.2 tested on 535.129.03
272// CUDA 12.3 UNTESTED
273#define CU_12_4_MASK_OFF 0x4ac
274// 12.4 tested on 550.54.14 and 550.54.15
275// CUDA 12.5 UNTESTED
276#define CU_12_6_MASK_OFF 0x4ec
277// 12.6 tested on 560.35.03
272 278
273// Offsets for the stream struct on aarch64 279// Offsets for the stream struct on aarch64
274// All tested on Nov 13th, 2023 280// All tested on Nov 13th, 2023
275#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 281#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2
276#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier 282#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier
277#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin 283#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin
284#define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin
278 285
279// Used up through CUDA 11.8 in the stream struct 286// Used up through CUDA 11.8 in the stream struct
280struct stream_sm_mask { 287struct stream_sm_mask {
@@ -316,10 +323,11 @@ int detect_parker_soc() {
316} 323}
317#endif // __aarch64__ 324#endif // __aarch64__
318 325
319// Should work for CUDA 8.0 through 12.2 326// Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6
320// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in 327// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
321// our header 328// our header
322void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { 329void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
330 // When the old API is used on GPUs with over 64 TPCs, disable all TPCs with IDs >= 64 (mask bits 64-127)
323 uint128_t full_mask = -1; 331 uint128_t full_mask = -1;
324 full_mask <<= 64; 332 full_mask <<= 64;
325 full_mask |= mask; 333 full_mask |= mask;
@@ -377,6 +385,12 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
377 case 12020: 385 case 12020:
378 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); 386 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
379 break; 387 break;
388 case 12040:
389 hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF);
390 break;
391 case 12060:
392 hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF);
393 break;
380#elif __aarch64__ 394#elif __aarch64__
381 case 9000: { 395 case 9000: {
382 // Jetson TX2 offset is slightly different on CUDA 9.0. 396 // Jetson TX2 offset is slightly different on CUDA 9.0.
@@ -402,6 +416,9 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
402 case 11040: 416 case 11040:
403 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); 417 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
404 break; 418 break;
419 case 12060:
420 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON);
421 break;
405#endif 422#endif
406 } 423 }
407 424
diff --git a/libsmctrl.h b/libsmctrl.h
index a8207b4..eca1f70 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -1,5 +1,5 @@
1/** 1/**
2 * Copyright 2023 Joshua Bakita 2 * Copyright 2024 Joshua Bakita
3 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug 3 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda. 4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 */ 5 */
@@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
21// (overrides global mask) 21// (overrides global mask)
22// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on 22// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
23// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 23// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
24// Supported: CUDA 8.0 - CUDA 12.1 24// Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6
25extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); 25extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
26extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); 26extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
27// Set TPC mask for the next kernel launch from the caller's CPU thread 27// Set TPC mask for the next kernel launch from the caller's CPU thread
@@ -47,6 +47,10 @@ extern void libsmctrl_set_next_mask(uint64_t mask);
47 * 47 *
48 * Note that the bitwise inversion operator (~, as used above) is very useful, 48 * Note that the bitwise inversion operator (~, as used above) is very useful,
49 * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull) 49 * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull)
50 *
51 * On GPUs with over 64 TPCs, use the _mask_ext() functions to support 128-bit
52 * masks. If using a 64-bit mask on a GPU with more than 64 TPCs, all TPCs with
53 * IDs of 64 or above will be disabled.
50 */ 54 */
51 55
52/* INFORMATIONAL FUNCTIONS */ 56/* INFORMATIONAL FUNCTIONS */