diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2024-11-26 11:55:26 -0500 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2024-11-26 11:58:37 -0500 |
commit | 3f9bda39d84f168c1b9f8c26075a72574645f00f (patch) | |
tree | 26d7bc0b51138d9873fac86dbbea4067960fbaea | |
parent | ebf2f07de91d9e341acc6df25e928e87b25b958d (diff) |
Support stream masking on CUDA 12.4 (x86) and 12.6 (x86, aarch64)
Credit to Nordine Feddal for testing CUDA 12.4 on 550.54.14.
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | libsmctrl.c | 21 | ||||
-rw-r--r-- | libsmctrl.h | 8 |
3 files changed, 26 insertions, 5 deletions
@@ -93,7 +93,7 @@ make tests | |||
93 | #### Known Working | 93 | #### Known Working |
94 | 94 | ||
95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs | 95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs |
96 | - CUDA 8.1 through 12.2 | 96 | - CUDA 8.0 through 12.2, plus 12.4 and 12.6 |
97 | - `x86_64` and Jetson `aarch64` platforms | 97 | - `x86_64` and Jetson `aarch64` platforms |
98 | 98 | ||
99 | #### Known Issues | 99 | #### Known Issues |
diff --git a/libsmctrl.c b/libsmctrl.c index 817cb5d..b10b885 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * +-----------+---------------+---------------+--------------+ | 11 | * +-----------+---------------+---------------+--------------+ |
12 | * | Version | Global Mask | Stream Mask | Next Mask | | 12 | * | Version | Global Mask | Stream Mask | Next Mask | |
13 | * +-----------+---------------+---------------+--------------+ | 13 | * +-----------+---------------+---------------+--------------+ |
14 | * | 11.0-12.2 | TMD/QMD Hook | stream struct | TMD/QMD Hook | | 14 | * | 11.0-12.6 | TMD/QMD Hook | stream struct | TMD/QMD Hook | |
15 | * | 10.2 | global struct | stream struct | N/A | | 15 | * | 10.2 | global struct | stream struct | N/A | |
16 | * | 8.0-10.1 | N/A | stream struct | N/A | | 16 | * | 8.0-10.1 | N/A | stream struct | N/A | |
17 | * +-----------+---------------+---------------+--------------+ | 17 | * +-----------+---------------+---------------+--------------+ |
@@ -269,12 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
269 | // 12.0 tested on 525.147.05 | 269 | // 12.0 tested on 525.147.05 |
270 | #define CU_12_2_MASK_OFF 0x4e4 | 270 | #define CU_12_2_MASK_OFF 0x4e4 |
271 | // 12.2 tested on 535.129.03 | 271 | // 12.2 tested on 535.129.03 |
272 | // CUDA 12.3 UNTESTED | ||
273 | #define CU_12_4_MASK_OFF 0x4ac | ||
274 | // 12.4 tested on 550.54.14 and 550.54.15 | ||
275 | // CUDA 12.5 UNTESTED | ||
276 | #define CU_12_6_MASK_OFF 0x4ec | ||
277 | // 12.6 tested on 560.35.03 | ||
272 | 278 | ||
273 | // Offsets for the stream struct on aarch64 | 279 | // Offsets for the stream struct on aarch64 |
274 | // All tested on Nov 13th, 2023 | 280 | // All tested on Nov 13th, 2023 |
275 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 | 281 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 |
276 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier | 282 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier |
277 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin | 283 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin |
284 | #define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin | ||
278 | 285 | ||
279 | // Used up through CUDA 11.8 in the stream struct | 286 | // Used up through CUDA 11.8 in the stream struct |
280 | struct stream_sm_mask { | 287 | struct stream_sm_mask { |
@@ -316,10 +323,11 @@ int detect_parker_soc() { | |||
316 | } | 323 | } |
317 | #endif // __aarch64__ | 324 | #endif // __aarch64__ |
318 | 325 | ||
319 | // Should work for CUDA 8.0 through 12.2 | 326 | // Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6 |
320 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 327 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
321 | // our header | 328 | // our header |
322 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 329 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
330 | // When the old API is used on GPUs with over 64 TPCs, disable all TPCs beyond the first 64 | ||
323 | uint128_t full_mask = -1; | 331 | uint128_t full_mask = -1; |
324 | full_mask <<= 64; | 332 | full_mask <<= 64; |
325 | full_mask |= mask; | 333 | full_mask |= mask; |
@@ -377,6 +385,12 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
377 | case 12020: | 385 | case 12020: |
378 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); | 386 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); |
379 | break; | 387 | break; |
388 | case 12040: | ||
389 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); | ||
390 | break; | ||
391 | case 12060: | ||
392 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF); | ||
393 | break; | ||
380 | #elif __aarch64__ | 394 | #elif __aarch64__ |
381 | case 9000: { | 395 | case 9000: { |
382 | // Jetson TX2 offset is slightly different on CUDA 9.0. | 396 | // Jetson TX2 offset is slightly different on CUDA 9.0. |
@@ -402,6 +416,9 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
402 | case 11040: | 416 | case 11040: |
403 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); | 417 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); |
404 | break; | 418 | break; |
419 | case 12060: | ||
420 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON); | ||
421 | break; | ||
405 | #endif | 422 | #endif |
406 | } | 423 | } |
407 | 424 | ||
diff --git a/libsmctrl.h b/libsmctrl.h index a8207b4..eca1f70 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /** | 1 | /** |
2 | * Copyright 2023 Joshua Bakita | 2 | * Copyright 2024 Joshua Bakita |
3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug |
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
5 | */ | 5 | */ |
@@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask); | |||
21 | // (overrides global mask) | 21 | // (overrides global mask) |
22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | 22 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on |
23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
24 | // Supported: CUDA 8.0 - CUDA 12.1 | 24 | // Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6 |
25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); | 26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); |
27 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
@@ -47,6 +47,10 @@ extern void libsmctrl_set_next_mask(uint64_t mask); | |||
47 | * | 47 | * |
48 | * Note that the bitwise inversion operator (~, as used above) is very useful, | 48 | * Note that the bitwise inversion operator (~, as used above) is very useful, |
49 | * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull) | 49 | * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull) |
50 | * | ||
51 | * On GPUs with over 64 TPCs, use the _mask_ext() functions to support 128-bit | ||
52 | * masks. If using a 64-bit mask on a GPU with more than 64 TPCs, all TPCs beyond | ||
53 | * the first 64 will be disabled. | ||
50 | */ | 54 | */ |
51 | 55 | ||
52 | /* INFORMATIONAL FUNCTIONS */ | 56 | /* INFORMATIONAL FUNCTIONS */ |