Support stream masking on CUDA 12.3 (x86) and 12.5 (x86)

author: Joshua Bakita <bakitajoshua@gmail.com> 2024-11-26 20:05:44 -0500
committer: Joshua Bakita <bakitajoshua@gmail.com> 2024-11-26 20:05:44 -0500
commit: 2ad0e819a9a9652f6afc0b6da4d70a1232c124d7 (patch)
tree: 7eb8e751257d8e58ff515d121c7e6594cc4e51bf
parent: 3f9bda39d84f168c1b9f8c26075a72574645f00f (diff)
3 files changed, 19 insertions, 13 deletions
diff --git a/README.md b/README.md
index 0c99113..c27ab9b 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ make tests
 #### Known Working
 - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
- CUDA 8.0 through 12.2, plus 12.4 and 12.6
+- CUDA 8.0 through 12.6
 - `x86_64` and Jetson `aarch64` platforms
 #### Known Issues
@@ -129,3 +129,5 @@ How this works:
 3. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats.
 Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture.
+If the loop hangs (e.g. at offset 40), terminate and restart the loop with `i` initialized past the offset that hung (e.g. at offset 48).
diff --git a/libsmctrl.c b/libsmctrl.c
index b10b885..7202572 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -269,19 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) {
 // 12.0 tested on 525.147.05
 #define CU_12_2_MASK_OFF 0x4e4
 // 12.2 tested on 535.129.03
-// CUDA 12.3 UNTESTED
+#define CU_12_3_MASK_OFF 0x49c
+// 12.3 tested on 545.29.06
 #define CU_12_4_MASK_OFF 0x4ac
 // 12.4 tested on 550.54.14 and 550.54.15
-// CUDA 12.5 UNTESTED
+#define CU_12_5_MASK_OFF 0x4ec
-#define CU_12_6_MASK_OFF 0x4ec
+// 12.5 tested on 555.58.02
 // 12.6 tested on 560.35.03
-// Offsets for the stream struct on aarch64
+// Offsets for the stream struct on Jetson aarch64
-// All tested on Nov 13th, 2023
+#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023)
-#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2
+#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023)
-#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier
+#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023)
-#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin
+#define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin (Nov 2024)
-#define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin
 // Used up through CUDA 11.8 in the stream struct
 struct stream_sm_mask {
@@ -323,7 +323,7 @@ int detect_parker_soc() {
 }
 #endif // __aarch64__
-// Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6
+// Should work for CUDA 8.0 through 12.6
 // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
 // our header
 void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
@@ -385,11 +385,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
        case 12020:
                hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
                break;
+        case 12030:
+                hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF);
+                break;
        case 12040:
                hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF);
                break;
+        case 12050:
        case 12060:
-                hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF);
+                hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF);
                break;
 #elif __aarch64__
        case 9000: {
diff --git a/libsmctrl.h b/libsmctrl.h
index eca1f70..6285de6 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
 // (overrides global mask)
 // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
 // @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
-// Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6
+// Supported: CUDA 8.0 - CUDA 12.6
 extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
 extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
 // Set TPC mask for the next kernel launch from the caller's CPU thread
author	Joshua Bakita <bakitajoshua@gmail.com>	2024-11-26 20:05:44 -0500
committer	Joshua Bakita <bakitajoshua@gmail.com>	2024-11-26 20:05:44 -0500
commit	2ad0e819a9a9652f6afc0b6da4d70a1232c124d7 (patch)
tree	7eb8e751257d8e58ff515d121c7e6594cc4e51bf
parent	3f9bda39d84f168c1b9f8c26075a72574645f00f (diff)