aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2024-12-19 13:36:40 -0500
committerJoshua Bakita <jbakita@cs.unc.edu>2024-12-19 13:36:40 -0500
commitaa63a02efa5fc8701f0c3418704bbbc2051c1042 (patch)
tree030a0ef1136b9a955e4d749336417bf453b9b734
parent147c69f31f25c3dc79b7943a0c56c171fe306682 (diff)
Support CUDA 12.2, 12.5, and 12.6 on Jetson aarch64
Also test and note that stream masking on CUDA 6.5 seems impossible.
-rw-r--r--README.md7
-rw-r--r--libsmctrl.c33
2 files changed, 32 insertions, 8 deletions
diff --git a/README.md b/README.md
index c27ab9b..ce32b19 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Please cite this paper in any work which leverages our library. Here's the BibTe
17} 17}
18``` 18```
19 19
20Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and libsmctrl.h for details and examples of how to use this library. 20Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and `libsmctrl.h` for details and examples of how to use this library.
21We strongly encourage consulting those resources first; the below comments serve merely as an addendum. 21We strongly encourage consulting those resources first; the below comments serve merely as an addendum.
22 22
23## Run-time Dependencies 23## Run-time Dependencies
@@ -104,6 +104,7 @@ make tests
104 - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada 104 - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada
105- Untested on H100 (compute capability 9.0) 105- Untested on H100 (compute capability 9.0)
106- Untested on non-Jetson `aarch64` platforms 106- Untested on non-Jetson `aarch64` platforms
107- Untested on CUDA 11.8, 12.0, and 12.1 on Jetson `aarch64`
107 108
108## Important Limitations 109## Important Limitations
109 110
@@ -113,7 +114,7 @@ make tests
1132. No aspect of this system prevents implicit synchronization on the GPU. 1142. No aspect of this system prevents implicit synchronization on the GPU.
114 See prior work, particularly that of Amert et al. (perhaps the CUPiD^RT paper), for ways to avoid this. 115 See prior work, particularly that of Amert et al. (perhaps the CUPiD^RT paper), for ways to avoid this.
115 116
116## Porting to New Architectures 117## Porting Stream Masking to Newer CUDA Versions
117 118
118Build the tests with `make tests`, and then run the following: 119Build the tests with `make tests`, and then run the following:
119``` 120```
@@ -124,7 +125,7 @@ How this works:
124 125
1251. If `MASK_OFF` is set, `libsmctrl` applies this as a byte offset to a base address for the location 1261. If `MASK_OFF` is set, `libsmctrl` applies this as a byte offset to a base address for the location
126 of the SM mask fields in CUDA's stream data structure. 127 of the SM mask fields in CUDA's stream data structure.
127 - That base address is the one for CUDA 12.2 at time of writing 128 - That base address is the one for CUDA 12.2 at time of writing.
1282. The stream masking test is run. 1292. The stream masking test is run.
1293. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats. 1303. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats.
130 131
diff --git a/libsmctrl.c b/libsmctrl.c
index 30edb32..1018e44 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -1,5 +1,5 @@
1/** 1/**
2 * Copyright 2023 Joshua Bakita 2 * Copyright 2022-2024 Joshua Bakita
3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug 3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda. 4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 * 5 *
@@ -246,6 +246,8 @@ void libsmctrl_set_next_mask(uint64_t mask) {
246/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ 246/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
247 247
248// Offsets for the stream struct on x86_64 248// Offsets for the stream struct on x86_64
249// No offset appears to work with CUDA 6.5 (tried 0x0--0x1b4 w/ 4-byte step)
250// 6.5 tested on 340.118
249#define CU_8_0_MASK_OFF 0xec 251#define CU_8_0_MASK_OFF 0xec
250#define CU_9_0_MASK_OFF 0x130 252#define CU_9_0_MASK_OFF 0x130
251// CUDA 9.0 and 9.1 use the same offset 253// CUDA 9.0 and 9.1 use the same offset
@@ -274,14 +276,26 @@ void libsmctrl_set_next_mask(uint64_t mask) {
274#define CU_12_4_MASK_OFF 0x4ac 276#define CU_12_4_MASK_OFF 0x4ac
275// 12.4 tested on 550.54.14 and 550.54.15 277// 12.4 tested on 550.54.14 and 550.54.15
276#define CU_12_5_MASK_OFF 0x4ec 278#define CU_12_5_MASK_OFF 0x4ec
279// CUDA 12.5 and 12.6 use the same offset
277// 12.5 tested on 555.58.02 280// 12.5 tested on 555.58.02
278// 12.6 tested on 560.35.03 281// 12.6 tested on 560.35.03
279 282
280// Offsets for the stream struct on Jetson aarch64 283// Offsets for the stream struct on Jetson aarch64
281#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023) 284#define CU_9_0_MASK_OFF_JETSON 0x128
282#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023) 285// 9.0 tested on Jetpack 3.x (TX2, Nov 2023)
283#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023) 286#define CU_10_2_MASK_OFF_JETSON 0x24c
283#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023) 287// 10.2 tested on Jetpack 4.x (AGX Xavier and TX2, Nov 2023)
288#define CU_11_4_MASK_OFF_JETSON 0x394
289// 11.4 tested on Jetpack 5.x (AGX Orin, Nov 2023)
290// TODO: 11.8, 12.0, 12.1, and 12.2 on Jetpack 5.x via compatibility packages
291#define CU_12_2_MASK_OFF_JETSON 0x50c
292// 12.2 tested on Jetpack 6.x (AGX Orin, Dec 2024)
293#define CU_12_4_MASK_OFF_JETSON 0x4c4
294// 12.4 tested on Jetpack 6.x with cuda-compat-12-4 (AGX Orin, Dec 2024)
295#define CU_12_5_MASK_OFF_JETSON 0x50c
296// 12.5 tested on Jetpack 6.x with cuda-compat-12-5 (AGX Orin, Dec 2024)
297#define CU_12_6_MASK_OFF_JETSON 0x514
298// 12.6 tested on Jetpack 6.x with cuda-compat-12-6 (AGX Orin, Dec 2024)
285 299
286// Used up through CUDA 11.8 in the stream struct 300// Used up through CUDA 11.8 in the stream struct
287struct stream_sm_mask { 301struct stream_sm_mask {
@@ -420,6 +434,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
420 case 11040: 434 case 11040:
421 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); 435 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
422 break; 436 break;
437 case 12020:
438 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF_JETSON);
439 break;
440 case 12040:
441 hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF_JETSON);
442 break;
443 case 12050:
444 hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF_JETSON);
445 break;
423 case 12060: 446 case 12060:
424 hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON); 447 hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON);
425 break; 448 break;