diff options
| author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-11-29 17:52:28 -0500 |
|---|---|---|
| committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-11-29 18:24:14 -0500 |
| commit | 3c075c8f71a7c85d735018143fc13a6eb91813eb (patch) | |
| tree | ad31bcb0f409364622c964e5f6d200201287ba5a | |
| parent | 3ee974590403730f2fea911a2574d335cedc4fab (diff) | |
Fix stream masking on many platforms and support >64-bit stream masks
Previously, the library did not distinguish between aarch64 and x86_64
stream offsets, causing incorrect offsets to be used in many circumstances.
This has now been fixed.
A new function, libsmctrl_set_stream_mask_ext(), has also been added;
it supports masking up to 128 TPCs (rather than just 64).
| -rw-r--r-- | libsmctrl.c | 132 | ||||
| -rw-r--r-- | libsmctrl.h | 2 |
2 files changed, 96 insertions, 38 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index f932b5f..526331f 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -30,6 +30,8 @@ | |||
| 30 | 30 | ||
| 31 | #include <dlfcn.h> | 31 | #include <dlfcn.h> |
| 32 | 32 | ||
| 33 | #include "libsmctrl.h" | ||
| 34 | |||
| 33 | // In functions that do not return an error code, we favor terminating with an | 35 | // In functions that do not return an error code, we favor terminating with an |
| 34 | // error rather than merely printing a warning and continuing. | 36 | // error rather than merely printing a warning and continuing. |
| 35 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ | 37 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ |
| @@ -235,28 +237,48 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
| 235 | 237 | ||
| 236 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ | 238 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ |
| 237 | 239 | ||
| 240 | // Offsets for the stream struct on x86_64 | ||
| 238 | #define CU_8_0_MASK_OFF 0xec | 241 | #define CU_8_0_MASK_OFF 0xec |
| 239 | #define CU_9_0_MASK_OFF 0x130 | 242 | #define CU_9_0_MASK_OFF 0x130 |
| 240 | #define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2 | ||
| 241 | // CUDA 9.0 and 9.1 use the same offset | 243 | // CUDA 9.0 and 9.1 use the same offset |
| 244 | // 9.1 tested on 390.157 | ||
| 242 | #define CU_9_2_MASK_OFF 0x140 | 245 | #define CU_9_2_MASK_OFF 0x140 |
| 243 | #define CU_10_0_MASK_OFF 0x24c | 246 | #define CU_10_0_MASK_OFF 0x244 |
| 244 | // CUDA 10.0, 10.1 and 10.2 use the same offset | 247 | // CUDA 10.0, 10.1 and 10.2 use the same offset |
| 248 | // 10.1 tested on 418.113 | ||
| 249 | // 10.2 tested on 440.100, 440.82, 440.64, and 440.36 | ||
| 245 | #define CU_11_0_MASK_OFF 0x274 | 250 | #define CU_11_0_MASK_OFF 0x274 |
| 246 | #define CU_11_1_MASK_OFF 0x2c4 | 251 | #define CU_11_1_MASK_OFF 0x2c4 |
| 247 | #define CU_11_2_MASK_OFF 0x37c | 252 | #define CU_11_2_MASK_OFF 0x37c |
| 248 | // CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset | 253 | // CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset |
| 254 | // 11.4 tested on 470.223.02 | ||
| 249 | #define CU_11_6_MASK_OFF 0x38c | 255 | #define CU_11_6_MASK_OFF 0x38c |
| 250 | #define CU_11_7_MASK_OFF 0x3c4 | 256 | #define CU_11_7_MASK_OFF 0x3c4 |
| 251 | #define CU_11_8_MASK_OFF 0x47c | 257 | #define CU_11_8_MASK_OFF 0x47c |
| 258 | // 11.8 tested on 520.56.06 | ||
| 252 | #define CU_12_0_MASK_OFF 0x4cc | 259 | #define CU_12_0_MASK_OFF 0x4cc |
| 253 | // CUDA 12.0 and 12.1 use the same offset | 260 | // CUDA 12.0 and 12.1 use the same offset |
| 261 | // 12.0 tested on 525.147.05 | ||
| 262 | #define CU_12_2_MASK_OFF 0x4e4 | ||
| 263 | // 12.2 tested on 535.129.03 | ||
| 264 | |||
| 265 | // Offsets for the stream struct on aarch64 | ||
| 266 | // All tested on Nov 13th, 2023 | ||
| 267 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 | ||
| 268 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier | ||
| 269 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin | ||
| 254 | 270 | ||
| 255 | // Layout in CUDA's `stream` struct | 271 | // Used up through CUDA 11.8 in the stream struct |
| 256 | struct stream_sm_mask { | 272 | struct stream_sm_mask { |
| 257 | uint32_t upper; | 273 | uint32_t upper; |
| 258 | uint32_t lower; | 274 | uint32_t lower; |
| 259 | } __attribute__((packed)); | 275 | }; |
| 276 | |||
| 277 | // Used starting with CUDA 12.0 in the stream struct | ||
| 278 | struct stream_sm_mask_v2 { | ||
| 279 | uint32_t enabled; | ||
| 280 | uint32_t mask[4]; | ||
| 281 | }; | ||
| 260 | 282 | ||
| 261 | // Check if this system has a Parker SoC (TX2/PX2 chip) | 283 | // Check if this system has a Parker SoC (TX2/PX2 chip) |
| 262 | // (CUDA 9.0 behaves slightly different on this platform.) | 284 | // (CUDA 9.0 behaves slightly different on this platform.) |
| @@ -286,36 +308,29 @@ int detect_parker_soc() { | |||
| 286 | } | 308 | } |
| 287 | #endif // __aarch64__ | 309 | #endif // __aarch64__ |
| 288 | 310 | ||
| 289 | // Should work for CUDA 8.0 through 12.1 | 311 | // Should work for CUDA 8.0 through 12.2 |
| 290 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 312 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
| 291 | // our header | 313 | // our header |
| 292 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 314 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
| 315 | uint128_t full_mask = -1; | ||
| 316 | full_mask <<= 64; | ||
| 317 | full_mask |= mask; | ||
| 318 | libsmctrl_set_stream_mask_ext(stream, full_mask); | ||
| 319 | } | ||
| 320 | |||
| 321 | void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | ||
| 293 | char* stream_struct_base = *(char**)stream; | 322 | char* stream_struct_base = *(char**)stream; |
| 294 | struct stream_sm_mask* hw_mask; | 323 | struct stream_sm_mask* hw_mask = NULL; |
| 324 | struct stream_sm_mask_v2* hw_mask_v2 = NULL; | ||
| 295 | int ver; | 325 | int ver; |
| 296 | cuDriverGetVersion(&ver); | 326 | cuDriverGetVersion(&ver); |
| 297 | switch (ver) { | 327 | switch (ver) { |
| 328 | #if __x86_64__ | ||
| 298 | case 8000: | 329 | case 8000: |
| 299 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); | 330 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); |
| 300 | case 9000: | 331 | case 9000: |
| 301 | case 9010: { | 332 | case 9010: { |
| 302 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); | 333 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); |
| 303 | #if __aarch64__ | ||
| 304 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
| 305 | // Only compile the check into ARM64 builds. | ||
| 306 | int is_parker; | ||
| 307 | const char* err_str; | ||
| 308 | if ((is_parker = detect_parker_soc()) < 0) { | ||
| 309 | cuGetErrorName(-is_parker, &err_str); | ||
| 310 | fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call " | ||
| 311 | "failed while doing compatibilty test." | ||
| 312 | "Error, '%s'. Not applying stream " | ||
| 313 | "mask.\n", err_str); | ||
| 314 | } | ||
| 315 | |||
| 316 | if (is_parker) | ||
| 317 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2); | ||
| 318 | #endif | ||
| 319 | break; | 334 | break; |
| 320 | } | 335 | } |
| 321 | case 9020: | 336 | case 9020: |
| @@ -349,25 +364,66 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
| 349 | break; | 364 | break; |
| 350 | case 12000: | 365 | case 12000: |
| 351 | case 12010: | 366 | case 12010: |
| 352 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | 367 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF); |
| 368 | break; | ||
| 369 | case 12020: | ||
| 370 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); | ||
| 371 | break; | ||
| 372 | #elif __aarch64__ | ||
| 373 | case 9000: { | ||
| 374 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
| 375 | // Only compile the check into ARM64 builds. | ||
| 376 | // TODO: Always verify Jetson-board-only on aarch64. | ||
| 377 | int is_parker; | ||
| 378 | const char* err_str; | ||
| 379 | if ((is_parker = detect_parker_soc()) < 0) { | ||
| 380 | cuGetErrorName(-is_parker, &err_str); | ||
| 381 | abort(1, 0, "While performing platform-specific " | ||
| 382 | "compatibility checks for stream masking, " | ||
| 383 | "CUDA call failed with error '%s'.", err_str); | ||
| 384 | } | ||
| 385 | |||
| 386 | if (!is_parker) | ||
| 387 | abort(1, 0, "Not supported on non-Jetson aarch64."); | ||
| 388 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON); | ||
| 353 | break; | 389 | break; |
| 354 | default: { | 390 | } |
| 355 | // For experimenting to determine the right mask offset, set the MASK_OFF | 391 | case 10020: |
| 356 | // environment variable (positive and negative numbers are supported) | 392 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON); |
| 357 | char* mask_off_str = getenv("MASK_OFF"); | 393 | break; |
| 358 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | 394 | case 11040: |
| 359 | if (mask_off_str) { | 395 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); |
| 360 | int off = atoi(mask_off_str); | 396 | break; |
| 361 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " | 397 | #endif |
| 362 | "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); | 398 | } |
| 363 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); | 399 | |
| 364 | } else { | 400 | // For experimenting to determine the right mask offset, set the MASK_OFF |
| 365 | return; | 401 | // environment variable (positive and negative numbers are supported) |
| 366 | }} | 402 | char* mask_off_str = getenv("MASK_OFF"); |
| 403 | if (mask_off_str) { | ||
| 404 | int off = atoi(mask_off_str); | ||
| 405 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x " | ||
| 406 | "(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off); | ||
| 407 | if (CU_12_2_MASK_OFF + off < 0) | ||
| 408 | abort(1, 0, "Total offset cannot be less than 0! Aborting..."); | ||
| 409 | // +4 bytes to convert a mask found with this for use with hw_mask | ||
| 410 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off); | ||
| 367 | } | 411 | } |
| 368 | 412 | ||
| 369 | hw_mask->upper = mask >> 32; | 413 | // Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs |
| 370 | hw_mask->lower = mask; | 414 | if (hw_mask) { |
| 415 | hw_mask->upper = mask >> 32; | ||
| 416 | hw_mask->lower = mask; | ||
| 417 | } else if (hw_mask_v2) { | ||
| 418 | hw_mask_v2->enabled = 1; | ||
| 419 | hw_mask_v2->mask[0] = mask; | ||
| 420 | hw_mask_v2->mask[1] = mask >> 32; | ||
| 421 | hw_mask_v2->mask[2] = mask >> 64; | ||
| 422 | hw_mask_v2->mask[3] = mask >> 96; | ||
| 423 | } else { | ||
| 424 | abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and" | ||
| 425 | " no fallback MASK_OFF set!", ver); | ||
| 426 | } | ||
| 371 | } | 427 | } |
| 372 | 428 | ||
| 373 | /* INFORMATIONAL FUNCTIONS */ | 429 | /* INFORMATIONAL FUNCTIONS */ |
diff --git a/libsmctrl.h b/libsmctrl.h index 990d434..a8207b4 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
| @@ -9,6 +9,7 @@ extern "C" { | |||
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include <stdint.h> | 11 | #include <stdint.h> |
| 12 | typedef unsigned __int128 uint128_t; | ||
| 12 | 13 | ||
| 13 | /* PARTITIONING FUNCTIONS */ | 14 | /* PARTITIONING FUNCTIONS */ |
| 14 | 15 | ||
| @@ -22,6 +23,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask); | |||
| 22 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
| 23 | // Supported: CUDA 8.0 - CUDA 12.1 | 24 | // Supported: CUDA 8.0 - CUDA 12.1 |
| 24 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
| 26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); | ||
| 25 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
| 26 | // (overrides global and per-stream masks, applies only to next launch). | 28 | // (overrides global and per-stream masks, applies only to next launch). |
| 27 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 29 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
