about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Joshua Bakita <bakitajoshua@gmail.com> 2024-11-26 20:05:44 -0500
committer Joshua Bakita <bakitajoshua@gmail.com> 2024-11-26 20:05:44 -0500
commit 2ad0e819a9a9652f6afc0b6da4d70a1232c124d7 (patch)
tree 7eb8e751257d8e58ff515d121c7e6594cc4e51bf
parent 3f9bda39d84f168c1b9f8c26075a72574645f00f (diff)
Support stream masking on CUDA 12.3 (x86) and 12.5 (x86)
-rw-r--r-- README.md 4
-rw-r--r-- libsmctrl.c 26
-rw-r--r-- libsmctrl.h 2
3 files changed, 19 insertions, 13 deletions
diff --git a/README.md b/README.md
index 0c99113..c27ab9b 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ make tests
93#### Known Working 93#### Known Working
94 94
95- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs 95- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
96- CUDA 8.0 through 12.2, plus 12.4 and 12.6 96- CUDA 8.0 through 12.6
97- `x86_64` and Jetson `aarch64` platforms 97- `x86_64` and Jetson `aarch64` platforms
98 98
99#### Known Issues 99#### Known Issues
@@ -129,3 +129,5 @@ How this works:
1293. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats. 1293. If the test succeeded (returned zero) the loop aborts, otherwise it increments the offset to attempt and repeats.
130 130
131Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture. 131Once this loop aborts, take the found offset and add it into the switch statement for the appropriate CUDA version and CPU architecture.
132
133If the loop hangs (e.g. at offset 40), terminate and restart the loop with `i` initialized past the offset that hung (e.g. at offset 48).
diff --git a/libsmctrl.c b/libsmctrl.c
index b10b885..7202572 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -269,19 +269,19 @@ void libsmctrl_set_next_mask(uint64_t mask) {
269// 12.0 tested on 525.147.05 269// 12.0 tested on 525.147.05
270#define CU_12_2_MASK_OFF 0x4e4 270#define CU_12_2_MASK_OFF 0x4e4
271// 12.2 tested on 535.129.03 271// 12.2 tested on 535.129.03
272// CUDA 12.3 UNTESTED 272#define CU_12_3_MASK_OFF 0x49c
273// 12.3 tested on 545.29.06
273#define CU_12_4_MASK_OFF 0x4ac 274#define CU_12_4_MASK_OFF 0x4ac
274// 12.4 tested on 550.54.14 and 550.54.15 275// 12.4 tested on 550.54.14 and 550.54.15
275// CUDA 12.5 UNTESTED 276#define CU_12_5_MASK_OFF 0x4ec
276#define CU_12_6_MASK_OFF 0x4ec 277// 12.5 tested on 555.58.02
277// 12.6 tested on 560.35.03 278// 12.6 tested on 560.35.03
278 279
279// Offsets for the stream struct on aarch64 280// Offsets for the stream struct on Jetson aarch64
280// All tested on Nov 13th, 2023 281#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 (Nov 2023)
281#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 282#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier (Nov 2023)
282#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier 283#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin (Nov 2023)
283#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin 284#define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin (Nov 2024)
284#define CU_12_6_MASK_OFF_JETSON 0x514 // Tested on Jetson Orin
285 285
286// Used up through CUDA 11.8 in the stream struct 286// Used up through CUDA 11.8 in the stream struct
287struct stream_sm_mask { 287struct stream_sm_mask {
@@ -323,7 +323,7 @@ int detect_parker_soc() {
323} 323}
324#endif // __aarch64__ 324#endif // __aarch64__
325 325
326// Should work for CUDA 8.0 through 12.2, plus 12.4 and 12.6 326// Should work for CUDA 8.0 through 12.6
327// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in 327// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
328// our header 328// our header
329void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { 329void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
@@ -385,11 +385,15 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
385 case 12020: 385 case 12020:
386 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); 386 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
387 break; 387 break;
388 case 12030:
389 hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF);
390 break;
388 case 12040: 391 case 12040:
389 hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF); 392 hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF);
390 break; 393 break;
394 case 12050:
391 case 12060: 395 case 12060:
392 hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF); 396 hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF);
393 break; 397 break;
394#elif __aarch64__ 398#elif __aarch64__
395 case 9000: { 399 case 9000: {
diff --git a/libsmctrl.h b/libsmctrl.h
index eca1f70..6285de6 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -21,7 +21,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
21// (overrides global mask) 21// (overrides global mask)
22// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on 22// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
23// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 23// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
24// Supported: CUDA 8.0 - CUDA 12.2, plus 12.4 and 12.6 24// Supported: CUDA 8.0 - CUDA 12.6
25extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); 25extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
26extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); 26extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
27// Set TPC mask for the next kernel launch from the caller's CPU thread 27// Set TPC mask for the next kernel launch from the caller's CPU thread