diff options
| author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-11-26 20:05:44 -0500 |
|---|---|---|
| committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-11-26 20:05:44 -0500 |
| commit | 2ad0e819a9a9652f6afc0b6da4d70a1232c124d7 (patch) | |
| tree | 7eb8e751257d8e58ff515d121c7e6594cc4e51bf | |
| parent | 3f9bda39d84f168c1b9f8c26075a72574645f00f (diff) | |
Support stream masking on CUDA 12.3 (x86) and 12.5 (x86)
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | libsmctrl.c | 26 | ||||
| -rw-r--r-- | libsmctrl.h | 2 |
3 files changed, 19 insertions, 13 deletions
| @@ -93,7 +93,7 @@ make tests | |||
| 93 | #### Known Working | 93 | #### Known Working |
| 94 | 94 | ||
| 95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs | 95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs |
| 96 | - CUDA 8.0 through 12.2, plus 12.4 and 12.6 | 96 | - CUDA 8.0 through 12.6 |
| 97 | - `x86_64` and Jetson `aarch64` platforms | 97 | - `x86_64` and Jetson `aarch64` platforms |
| 98 | 98 | ||
| 99 | #### Known Issues | 99 | #### Known Issues |
| @@ -129,3 +129,5 @@ How this works: | |||
| 129 | 3. If the test succeeded (returned zero), the loop aborts; otherwise, it increments the offset to attempt and repeats. | 129 | 3. If the test succeeded (returned zero), the loop aborts; otherwise, it increments the offset to attempt and repeats. |
| 130 | 130 | ||
| 131 | Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture. | 131 | Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture. |
| 132 | |||
| 133 | If the loop hangs (e.g. at offset 40), terminate and restart the loop with `i` initialized past the offset that hung (e.g. at offset 48). | ||
diff --git a/libsmctrl.c b/libsmctrl.c index b10b885..7202572 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -269,19 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
| 269 | // 12.0 tested on 525.147.05 | 269 | // 12.0 tested on 525.147.05 |
| 270 | #define CU_12_2_MASK_OFF 0x4e4 | 270 | #define CU_12_2_MASK_OFF 0x4e4 |
| 271 | // 12.2 tested on 535.129.03 | 271 | // 12.2 tested on 535.129.03 |
| 272 | // CUDA 12.3 UNTESTED | 272 | #define CU_12_3_MASK_OFF 0x49c |
| 273 | // 12.3 tested on 545.29.06 | ||
| 273 | #define CU_12_4_MASK_OFF 0x4ac | 274 | #define CU_12_4_MASK_OFF 0x4ac |
| 274 | // 12.4 tested on 550.54.14 and 550.54.15 | 275 | // 12.4 tested on 550.54.14 and 550.54.15 |
| 275 | // CUDA 12.5 UNTESTED | 276 | #define CU_12_5_MASK_OFF 0x4ec |
| 276 | #define CU_12_6_MASK_OFF 0x4ec | 277 | // 12.5 tested on 555.58.02 |
| 277 | // 12.6 tested on 560.35.03 | 278 | // 12.6 tested on 560.35.03 |
| 278 | 279 | ||
| 279 | // Offsets for the stream struct on aarch64 | 280 | // Offsets for the stream struct on Jetson aarch64 |
| 280 | // All tested on Nov 13th, 2023 | 281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023) |
| 281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 | 282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023) |
| 282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier | 283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023) |
| 283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin | 284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin (Nov 2024) |
| 284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin | ||
| 285 | 285 | ||
| 286 | // Used up through CUDA 11.8 in the stream struct | 286 | // Used up through CUDA 11.8 in the stream struct |
| 287 | struct stream_sm_mask { | 287 | struct stream_sm_mask { |
| @@ -323,7 +323,7 @@ int detect_parker_soc() { | |||
| 323 | } | 323 | } |
| 324 | #endif // __aarch64__ | 324 | #endif // __aarch64__ |
| 325 | 325 | ||
| 326 | // Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6 | 326 | // Should work for CUDA 8.0 through 12.6 |
| 327 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 327 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
| 328 | // our header | 328 | // our header |
| 329 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 329 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
| @@ -385,11 +385,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
| 385 | case 12020: | 385 | case 12020: |
| 386 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); | 386 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); |
| 387 | break; | 387 | break; |
| 388 | case 12030: | ||
| 389 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF); | ||
| 390 | break; | ||
| 388 | case 12040: | 391 | case 12040: |
| 389 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); | 392 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); |
| 390 | break; | 393 | break; |
| 394 | case 12050: | ||
| 391 | case 12060: | 395 | case 12060: |
| 392 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF); | 396 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF); |
| 393 | break; | 397 | break; |
| 394 | #elif __aarch64__ | 398 | #elif __aarch64__ |
| 395 | case 9000: { | 399 | case 9000: { |
diff --git a/libsmctrl.h b/libsmctrl.h index eca1f70..6285de6 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
| @@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask); | |||
| 21 | // (overrides global mask) | 21 | // (overrides global mask) |
| 22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | 22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on |
| 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
| 24 | // Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6 | 24 | // Supported: CUDA 8.0 - CUDA 12.6 |
| 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
| 26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); | 26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); |
| 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
