diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2024-12-19 13:36:40 -0500 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2024-12-19 13:36:40 -0500 |
commit | aa63a02efa5fc8701f0c3418704bbbc2051c1042 (patch) | |
tree | 030a0ef1136b9a955e4d749336417bf453b9b734 | |
parent | 147c69f31f25c3dc79b7943a0c56c171fe306682 (diff) |
Support CUDA 12.2, 12.5, and 12.6 on Jetson aarch64
Also test and note that stream masking on CUDA 6.5 seems impossible.
-rw-r--r-- | README.md | 7 | ||||
-rw-r--r-- | libsmctrl.c | 33 |
2 files changed, 32 insertions, 8 deletions
@@ -17,7 +17,7 @@ Please cite this paper in any work which leverages our library. Here's the BibTe | |||
17 | } | 17 | } |
18 | ``` | 18 | ``` |
19 | 19 | ||
20 | Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and libsmctrl.h for details and examples of how to use this library. | 20 | Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and `libsmctrl.h` for details and examples of how to use this library. |
21 | We strongly encourage consulting those resources first; the below comments serve merely as an addendum. | 21 | We strongly encourage consulting those resources first; the below comments serve merely as an addendum. |
22 | 22 | ||
23 | ## Run-time Dependencies | 23 | ## Run-time Dependencies |
@@ -104,6 +104,7 @@ make tests | |||
104 | - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada | 104 | - Only relevant on GPUs with over 128 TPCs, such as the RTX 6000 Ada |
105 | - Untested on H100 (compute capability 9.0) | 105 | - Untested on H100 (compute capability 9.0) |
106 | - Untested on non-Jetson `aarch64` platforms | 106 | - Untested on non-Jetson `aarch64` platforms |
107 | - Untested on CUDA 11.8, 12.0, and 12.1 on Jetson `aarch64` | ||
107 | 108 | ||
108 | ## Important Limitations | 109 | ## Important Limitations |
109 | 110 | ||
@@ -113,7 +114,7 @@ make tests | |||
113 | 2. No aspect of this system prevents implicit synchronization on the GPU. | 114 | 2. No aspect of this system prevents implicit synchronization on the GPU. |
114 | See prior work, particularly that of Amert et al. (perhaps the CUPiD^RT paper), for ways to avoid this. | 115 | See prior work, particularly that of Amert et al. (perhaps the CUPiD^RT paper), for ways to avoid this. |
115 | 116 | ||
116 | ## Porting to New Architectures | 117 | ## Porting Stream Masking to Newer CUDA Versions |
117 | 118 | ||
118 | Build the tests with `make tests`, and then run the following: | 119 | Build the tests with `make tests`, and then run the following: |
119 | ``` | 120 | ``` |
@@ -124,7 +125,7 @@ How this works: | |||
124 | 125 | ||
125 | 1. If `MASK_OFF` is set, `libsmctrl` applies this as a byte offset to a base address for the location | 126 | 1. If `MASK_OFF` is set, `libsmctrl` applies this as a byte offset to a base address for the location |
126 | of the SM mask fields in CUDA's stream data structure. | 127 | of the SM mask fields in CUDA's stream data structure. |
127 | - That base address is the one for CUDA 12.2 at time of writing | 128 | - That base address is the one for CUDA 12.2 at time of writing. |
128 | 2. The stream masking test is run. | 129 | 2. The stream masking test is run. |
129 | 3. If the test succeeds (returns zero), the loop aborts; otherwise, the offset is incremented and the test is repeated. | 130 | 3. If the test succeeds (returns zero), the loop aborts; otherwise, the offset is incremented and the test is repeated. |
130 | 131 | ||
diff --git a/libsmctrl.c b/libsmctrl.c index 30edb32..1018e44 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /** | 1 | /** |
2 | * Copyright 2023 Joshua Bakita | 2 | * Copyright 2022-2024 Joshua Bakita |
3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug |
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
5 | * | 5 | * |
@@ -246,6 +246,8 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
246 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ | 246 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ |
247 | 247 | ||
248 | // Offsets for the stream struct on x86_64 | 248 | // Offsets for the stream struct on x86_64 |
249 | // No offset appears to work with CUDA 6.5 (tried 0x0--0x1b4 w/ 4-byte step) | ||
250 | // 6.5 tested on 340.118 | ||
249 | #define CU_8_0_MASK_OFF 0xec | 251 | #define CU_8_0_MASK_OFF 0xec |
250 | #define CU_9_0_MASK_OFF 0x130 | 252 | #define CU_9_0_MASK_OFF 0x130 |
251 | // CUDA 9.0 and 9.1 use the same offset | 253 | // CUDA 9.0 and 9.1 use the same offset |
@@ -274,14 +276,26 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
274 | #define CU_12_4_MASK_OFF 0x4ac | 276 | #define CU_12_4_MASK_OFF 0x4ac |
275 | // 12.4 tested on 550.54.14 and 550.54.15 | 277 | // 12.4 tested on 550.54.14 and 550.54.15 |
276 | #define CU_12_5_MASK_OFF 0x4ec | 278 | #define CU_12_5_MASK_OFF 0x4ec |
279 | // CUDA 12.5 and 12.6 use the same offset | ||
277 | // 12.5 tested on 555.58.02 | 280 | // 12.5 tested on 555.58.02 |
278 | // 12.6 tested on 560.35.03 | 281 | // 12.6 tested on 560.35.03 |
279 | 282 | ||
280 | // Offsets for the stream struct on Jetson aarch64 | 283 | // Offsets for the stream struct on Jetson aarch64 |
281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023) | 284 | #define CU_9_0_MASK_OFF_JETSON 0x128 |
282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023) | 285 | // 9.0 tested on Jetpack 3.x (TX2, Nov 2023) |
283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023) | 286 | #define CU_10_2_MASK_OFF_JETSON 0x24c |
284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin (Nov 2024) | 287 | // 10.2 tested on Jetpack 4.x (AGX Xaver and TX2, Nov 2023) |
288 | #define CU_11_4_MASK_OFF_JETSON 0x394 | ||
289 | // 11.4 tested on Jetpack 5.x (AGX Orin, Nov 2023) | ||
290 | // TODO: 11.8, 12.0, 12.1, and 12.2 on Jetpack 5.x via compatibility packages | ||
291 | #define CU_12_2_MASK_OFF_JETSON 0x50c | ||
292 | // 12.2 tested on Jetpack 6.x (AGX Orin, Dec 2024) | ||
293 | #define CU_12_4_MASK_OFF_JETSON 0x4c4 | ||
294 | // 12.4 tested on Jetpack 6.x with cuda-compat-12-4 (AGX Orin, Dec 2024) | ||
295 | #define CU_12_5_MASK_OFF_JETSON 0x50c | ||
296 | // 12.5 tested on Jetpack 6.x with cuda-compat-12-5 (AGX Orin, Dec 2024) | ||
297 | #define CU_12_6_MASK_OFF_JETSON 0x514 | ||
298 | // 12.6 tested on Jetpack 6.x with cuda-compat-12-6 (AGX Orin, Dec 2024) | ||
285 | 299 | ||
286 | // Used up through CUDA 11.8 in the stream struct | 300 | // Used up through CUDA 11.8 in the stream struct |
287 | struct stream_sm_mask { | 301 | struct stream_sm_mask { |
@@ -420,6 +434,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
420 | case 11040: | 434 | case 11040: |
421 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); | 435 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); |
422 | break; | 436 | break; |
437 | case 12020: | ||
438 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF_JETSON); | ||
439 | break; | ||
440 | case 12040: | ||
441 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF_JETSON); | ||
442 | break; | ||
443 | case 12050: | ||
444 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF_JETSON); | ||
445 | break; | ||
423 | case 12060: | 446 | case 12060: |
424 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON); | 447 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON); |
425 | break; | 448 | break; |