diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-11-26 20:05:44 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-11-26 20:05:44 -0500 |
commit | 2ad0e819a9a9652f6afc0b6da4d70a1232c124d7 (patch) | |
tree | 7eb8e751257d8e58ff515d121c7e6594cc4e51bf | |
parent | 3f9bda39d84f168c1b9f8c26075a72574645f00f (diff) |
Support stream masking on CUDA 12.3 (x86) and 12.5 (x86)
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | libsmctrl.c | 26 | ||||
-rw-r--r-- | libsmctrl.h | 2 |
3 files changed, 19 insertions, 13 deletions
@@ -93,7 +93,7 @@ make tests | |||
93 | #### Known Working | 93 | #### Known Working |
94 | 94 | ||
95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs | 95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs |
96 | - CUDA 8.0 through 12.2, plus 12.4 and 12.6 | 96 | - CUDA 8.0 through 12.6 |
97 | - `x86_64` and Jetson `aarch64` platforms | 97 | - `x86_64` and Jetson `aarch64` platforms |
98 | 98 | ||
99 | #### Known Issues | 99 | #### Known Issues |
@@ -129,3 +129,5 @@ How this works: | |||
129 | 3. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats. | 129 | 3. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats. |
130 | 130 | ||
131 | Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture. | 131 | Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture. |
132 | |||
133 | If the loop hangs (e.g. at offset 40), terminate and restart the loop with `i` initialized past the offset that hung (e.g. at offset 48). | ||
diff --git a/libsmctrl.c b/libsmctrl.c index b10b885..7202572 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -269,19 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
269 | // 12.0 tested on 525.147.05 | 269 | // 12.0 tested on 525.147.05 |
270 | #define CU_12_2_MASK_OFF 0x4e4 | 270 | #define CU_12_2_MASK_OFF 0x4e4 |
271 | // 12.2 tested on 535.129.03 | 271 | // 12.2 tested on 535.129.03 |
272 | // CUDA 12.3 UNTESTED | 272 | #define CU_12_3_MASK_OFF 0x49c |
273 | // 12.3 tested on 545.29.06 | ||
273 | #define CU_12_4_MASK_OFF 0x4ac | 274 | #define CU_12_4_MASK_OFF 0x4ac |
274 | // 12.4 tested on 550.54.14 and 550.54.15 | 275 | // 12.4 tested on 550.54.14 and 550.54.15 |
275 | // CUDA 12.5 UNTESTED | 276 | #define CU_12_5_MASK_OFF 0x4ec |
276 | #define CU_12_6_MASK_OFF 0x4ec | 277 | // 12.5 tested on 555.58.02 |
277 | // 12.6 tested on 560.35.03 | 278 | // 12.6 tested on 560.35.03 |
278 | 279 | ||
279 | // Offsets for the stream struct on aarch64 | 280 | // Offsets for the stream struct on Jetson aarch64 |
280 | // All tested on Nov 13th, 2023 | 281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023) |
281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 | 282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023) |
282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier | 283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023) |
283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin | 284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin (Nov 2024) |
284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin | ||
285 | 285 | ||
286 | // Used up through CUDA 11.8 in the stream struct | 286 | // Used up through CUDA 11.8 in the stream struct |
287 | struct stream_sm_mask { | 287 | struct stream_sm_mask { |
@@ -323,7 +323,7 @@ int detect_parker_soc() { | |||
323 | } | 323 | } |
324 | #endif // __aarch64__ | 324 | #endif // __aarch64__ |
325 | 325 | ||
326 | // Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6 | 326 | // Should work for CUDA 8.0 through 12.6 |
327 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 327 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
328 | // our header | 328 | // our header |
329 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 329 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
@@ -385,11 +385,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
385 | case 12020: | 385 | case 12020: |
386 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); | 386 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); |
387 | break; | 387 | break; |
388 | case 12030: | ||
389 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF); | ||
390 | break; | ||
388 | case 12040: | 391 | case 12040: |
389 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); | 392 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); |
390 | break; | 393 | break; |
394 | case 12050: | ||
391 | case 12060: | 395 | case 12060: |
392 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF); | 396 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF); |
393 | break; | 397 | break; |
394 | #elif __aarch64__ | 398 | #elif __aarch64__ |
395 | case 9000: { | 399 | case 9000: { |
diff --git a/libsmctrl.h b/libsmctrl.h index eca1f70..6285de6 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
@@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask); | |||
21 | // (overrides global mask) | 21 | // (overrides global mask) |
22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | 22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on |
23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
24 | // Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6 | 24 | // Supported: CUDA 8.0 - CUDA 12.6 |
25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); | 26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); |
27 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread |