diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-11-29 17:52:28 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-11-29 18:24:14 -0500 |
commit | 3c075c8f71a7c85d735018143fc13a6eb91813eb (patch) | |
tree | ad31bcb0f409364622c964e5f6d200201287ba5a | |
parent | 3ee974590403730f2fea911a2574d335cedc4fab (diff) |
Fix stream masking on many platforms and support >64-bit stream masks
Previously, the code did not distinguish between aarch64 and x86_64 stream
offsets, causing incorrect offsets to be used in many circumstances.
This has now been fixed.
A new function, libsmctrl_set_stream_mask_ext(), has also been added,
which supports masking up to 128 TPCs (rather than just 64).
-rw-r--r-- | libsmctrl.c | 132 | ||||
-rw-r--r-- | libsmctrl.h | 2 |
2 files changed, 96 insertions, 38 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index f932b5f..526331f 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -30,6 +30,8 @@ | |||
30 | 30 | ||
31 | #include <dlfcn.h> | 31 | #include <dlfcn.h> |
32 | 32 | ||
33 | #include "libsmctrl.h" | ||
34 | |||
33 | // In functions that do not return an error code, we favor terminating with an | 35 | // In functions that do not return an error code, we favor terminating with an |
34 | // error rather than merely printing a warning and continuing. | 36 | // error rather than merely printing a warning and continuing. |
35 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ | 37 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ |
@@ -235,28 +237,48 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
235 | 237 | ||
236 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ | 238 | /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ |
237 | 239 | ||
240 | // Offsets for the stream struct on x86_64 | ||
238 | #define CU_8_0_MASK_OFF 0xec | 241 | #define CU_8_0_MASK_OFF 0xec |
239 | #define CU_9_0_MASK_OFF 0x130 | 242 | #define CU_9_0_MASK_OFF 0x130 |
240 | #define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2 | ||
241 | // CUDA 9.0 and 9.1 use the same offset | 243 | // CUDA 9.0 and 9.1 use the same offset |
244 | // 9.1 tested on 390.157 | ||
242 | #define CU_9_2_MASK_OFF 0x140 | 245 | #define CU_9_2_MASK_OFF 0x140 |
243 | #define CU_10_0_MASK_OFF 0x24c | 246 | #define CU_10_0_MASK_OFF 0x244 |
244 | // CUDA 10.0, 10.1 and 10.2 use the same offset | 247 | // CUDA 10.0, 10.1 and 10.2 use the same offset |
248 | // 10.1 tested on 418.113 | ||
249 | // 10.2 tested on 440.100, 440.82, 440.64, and 440.36 | ||
245 | #define CU_11_0_MASK_OFF 0x274 | 250 | #define CU_11_0_MASK_OFF 0x274 |
246 | #define CU_11_1_MASK_OFF 0x2c4 | 251 | #define CU_11_1_MASK_OFF 0x2c4 |
247 | #define CU_11_2_MASK_OFF 0x37c | 252 | #define CU_11_2_MASK_OFF 0x37c |
248 | // CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset | 253 | // CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset |
254 | // 11.4 tested on 470.223.02 | ||
249 | #define CU_11_6_MASK_OFF 0x38c | 255 | #define CU_11_6_MASK_OFF 0x38c |
250 | #define CU_11_7_MASK_OFF 0x3c4 | 256 | #define CU_11_7_MASK_OFF 0x3c4 |
251 | #define CU_11_8_MASK_OFF 0x47c | 257 | #define CU_11_8_MASK_OFF 0x47c |
258 | // 11.8 tested on 520.56.06 | ||
252 | #define CU_12_0_MASK_OFF 0x4cc | 259 | #define CU_12_0_MASK_OFF 0x4cc |
253 | // CUDA 12.0 and 12.1 use the same offset | 260 | // CUDA 12.0 and 12.1 use the same offset |
261 | // 12.0 tested on 525.147.05 | ||
262 | #define CU_12_2_MASK_OFF 0x4e4 | ||
263 | // 12.2 tested on 535.129.03 | ||
264 | |||
265 | // Offsets for the stream struct on aarch64 | ||
266 | // All tested on Nov 13th, 2023 | ||
267 | #define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2 | ||
268 | #define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier | ||
269 | #define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin | ||
254 | 270 | ||
255 | // Layout in CUDA's `stream` struct | 271 | // Used up through CUDA 11.8 in the stream struct |
256 | struct stream_sm_mask { | 272 | struct stream_sm_mask { |
257 | uint32_t upper; | 273 | uint32_t upper; |
258 | uint32_t lower; | 274 | uint32_t lower; |
259 | } __attribute__((packed)); | 275 | }; |
276 | |||
277 | // Used starting with CUDA 12.0 in the stream struct | ||
278 | struct stream_sm_mask_v2 { | ||
279 | uint32_t enabled; | ||
280 | uint32_t mask[4]; | ||
281 | }; | ||
260 | 282 | ||
261 | // Check if this system has a Parker SoC (TX2/PX2 chip) | 283 | // Check if this system has a Parker SoC (TX2/PX2 chip) |
262 | // (CUDA 9.0 behaves slightly different on this platform.) | 284 | // (CUDA 9.0 behaves slightly different on this platform.) |
@@ -286,36 +308,29 @@ int detect_parker_soc() { | |||
286 | } | 308 | } |
287 | #endif // __aarch64__ | 309 | #endif // __aarch64__ |
288 | 310 | ||
289 | // Should work for CUDA 8.0 through 12.1 | 311 | // Should work for CUDA 8.0 through 12.2 |
290 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 312 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
291 | // our header | 313 | // our header |
292 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 314 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
315 | uint128_t full_mask = -1; | ||
316 | full_mask <<= 64; | ||
317 | full_mask |= mask; | ||
318 | libsmctrl_set_stream_mask_ext(stream, full_mask); | ||
319 | } | ||
320 | |||
321 | void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | ||
293 | char* stream_struct_base = *(char**)stream; | 322 | char* stream_struct_base = *(char**)stream; |
294 | struct stream_sm_mask* hw_mask; | 323 | struct stream_sm_mask* hw_mask = NULL; |
324 | struct stream_sm_mask_v2* hw_mask_v2 = NULL; | ||
295 | int ver; | 325 | int ver; |
296 | cuDriverGetVersion(&ver); | 326 | cuDriverGetVersion(&ver); |
297 | switch (ver) { | 327 | switch (ver) { |
328 | #if __x86_64__ | ||
298 | case 8000: | 329 | case 8000: |
299 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); | 330 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); |
300 | case 9000: | 331 | case 9000: |
301 | case 9010: { | 332 | case 9010: { |
302 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); | 333 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); |
303 | #if __aarch64__ | ||
304 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
305 | // Only compile the check into ARM64 builds. | ||
306 | int is_parker; | ||
307 | const char* err_str; | ||
308 | if ((is_parker = detect_parker_soc()) < 0) { | ||
309 | cuGetErrorName(-is_parker, &err_str); | ||
310 | fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call " | ||
311 | "failed while doing compatibilty test." | ||
312 | "Error, '%s'. Not applying stream " | ||
313 | "mask.\n", err_str); | ||
314 | } | ||
315 | |||
316 | if (is_parker) | ||
317 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2); | ||
318 | #endif | ||
319 | break; | 334 | break; |
320 | } | 335 | } |
321 | case 9020: | 336 | case 9020: |
@@ -349,25 +364,66 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
349 | break; | 364 | break; |
350 | case 12000: | 365 | case 12000: |
351 | case 12010: | 366 | case 12010: |
352 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); | 367 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF); |
368 | break; | ||
369 | case 12020: | ||
370 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF); | ||
371 | break; | ||
372 | #elif __aarch64__ | ||
373 | case 9000: { | ||
374 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
375 | // Only compile the check into ARM64 builds. | ||
376 | // TODO: Always verify Jetson-board-only on aarch64. | ||
377 | int is_parker; | ||
378 | const char* err_str; | ||
379 | if ((is_parker = detect_parker_soc()) < 0) { | ||
380 | cuGetErrorName(-is_parker, &err_str); | ||
381 | abort(1, 0, "While performing platform-specific " | ||
382 | "compatibility checks for stream masking, " | ||
383 | "CUDA call failed with error '%s'.", err_str); | ||
384 | } | ||
385 | |||
386 | if (!is_parker) | ||
387 | abort(1, 0, "Not supported on non-Jetson aarch64."); | ||
388 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON); | ||
353 | break; | 389 | break; |
354 | default: { | 390 | } |
355 | // For experimenting to determine the right mask offset, set the MASK_OFF | 391 | case 10020: |
356 | // environment variable (positive and negative numbers are supported) | 392 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON); |
357 | char* mask_off_str = getenv("MASK_OFF"); | 393 | break; |
358 | fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); | 394 | case 11040: |
359 | if (mask_off_str) { | 395 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON); |
360 | int off = atoi(mask_off_str); | 396 | break; |
361 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " | 397 | #endif |
362 | "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); | 398 | } |
363 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); | 399 | |
364 | } else { | 400 | // For experimenting to determine the right mask offset, set the MASK_OFF |
365 | return; | 401 | // environment variable (positive and negative numbers are supported) |
366 | }} | 402 | char* mask_off_str = getenv("MASK_OFF"); |
403 | if (mask_off_str) { | ||
404 | int off = atoi(mask_off_str); | ||
405 | fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x " | ||
406 | "(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off); | ||
407 | if (CU_12_2_MASK_OFF + off < 0) | ||
408 | abort(1, 0, "Total offset cannot be less than 0! Aborting..."); | ||
409 | // +4 bytes to convert a mask found with this for use with hw_mask | ||
410 | hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off); | ||
367 | } | 411 | } |
368 | 412 | ||
369 | hw_mask->upper = mask >> 32; | 413 | // Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs |
370 | hw_mask->lower = mask; | 414 | if (hw_mask) { |
415 | hw_mask->upper = mask >> 32; | ||
416 | hw_mask->lower = mask; | ||
417 | } else if (hw_mask_v2) { | ||
418 | hw_mask_v2->enabled = 1; | ||
419 | hw_mask_v2->mask[0] = mask; | ||
420 | hw_mask_v2->mask[1] = mask >> 32; | ||
421 | hw_mask_v2->mask[2] = mask >> 64; | ||
422 | hw_mask_v2->mask[3] = mask >> 96; | ||
423 | } else { | ||
424 | abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and" | ||
425 | " no fallback MASK_OFF set!", ver); | ||
426 | } | ||
371 | } | 427 | } |
372 | 428 | ||
373 | /* INFORMATIONAL FUNCTIONS */ | 429 | /* INFORMATIONAL FUNCTIONS */ |
diff --git a/libsmctrl.h b/libsmctrl.h index 990d434..a8207b4 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
@@ -9,6 +9,7 @@ extern "C" { | |||
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include <stdint.h> | 11 | #include <stdint.h> |
12 | typedef unsigned __int128 uint128_t; | ||
12 | 13 | ||
13 | /* PARTITIONING FUNCTIONS */ | 14 | /* PARTITIONING FUNCTIONS */ |
14 | 15 | ||
@@ -22,6 +23,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask); | |||
22 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 23 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
23 | // Supported: CUDA 8.0 - CUDA 12.1 | 24 | // Supported: CUDA 8.0 - CUDA 12.1 |
24 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 25 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
26 | extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask); | ||
25 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 27 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
26 | // (overrides global and per-stream masks, applies only to next launch). | 28 | // (overrides global and per-stream masks, applies only to next launch). |
27 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 29 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |