aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-11-29 17:52:28 -0500
committerJoshua Bakita <bakitajoshua@gmail.com>2023-11-29 18:24:14 -0500
commit3c075c8f71a7c85d735018143fc13a6eb91813eb (patch)
treead31bcb0f409364622c964e5f6d200201287ba5a
parent3ee974590403730f2fea911a2574d335cedc4fab (diff)
Fix stream masking on many platforms and support >64-bit stream masks
Previously, the library did not distinguish between aarch64 and x86_64 stream offsets, causing incorrect offsets to be used in many circumstances. This has now been fixed. A new function, libsmctrl_set_stream_mask_ext(), has also been added; it supports masks covering up to 128 TPCs (rather than just 64).
-rw-r--r--libsmctrl.c132
-rw-r--r--libsmctrl.h2
2 files changed, 96 insertions, 38 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index f932b5f..526331f 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -30,6 +30,8 @@
30 30
31#include <dlfcn.h> 31#include <dlfcn.h>
32 32
33#include "libsmctrl.h"
34
33// In functions that do not return an error code, we favor terminating with an 35// In functions that do not return an error code, we favor terminating with an
34// error rather than merely printing a warning and continuing. 36// error rather than merely printing a warning and continuing.
35#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ 37#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
@@ -235,28 +237,48 @@ void libsmctrl_set_next_mask(uint64_t mask) {
235 237
236/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/ 238/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
237 239
240// Offsets for the stream struct on x86_64
238#define CU_8_0_MASK_OFF 0xec 241#define CU_8_0_MASK_OFF 0xec
239#define CU_9_0_MASK_OFF 0x130 242#define CU_9_0_MASK_OFF 0x130
240#define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2
241// CUDA 9.0 and 9.1 use the same offset 243// CUDA 9.0 and 9.1 use the same offset
244// 9.1 tested on 390.157
242#define CU_9_2_MASK_OFF 0x140 245#define CU_9_2_MASK_OFF 0x140
243#define CU_10_0_MASK_OFF 0x24c 246#define CU_10_0_MASK_OFF 0x244
244// CUDA 10.0, 10.1 and 10.2 use the same offset 247// CUDA 10.0, 10.1 and 10.2 use the same offset
248// 10.1 tested on 418.113
249// 10.2 tested on 440.100, 440.82, 440.64, and 440.36
245#define CU_11_0_MASK_OFF 0x274 250#define CU_11_0_MASK_OFF 0x274
246#define CU_11_1_MASK_OFF 0x2c4 251#define CU_11_1_MASK_OFF 0x2c4
247#define CU_11_2_MASK_OFF 0x37c 252#define CU_11_2_MASK_OFF 0x37c
248// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset 253// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
254// 11.4 tested on 470.223.02
249#define CU_11_6_MASK_OFF 0x38c 255#define CU_11_6_MASK_OFF 0x38c
250#define CU_11_7_MASK_OFF 0x3c4 256#define CU_11_7_MASK_OFF 0x3c4
251#define CU_11_8_MASK_OFF 0x47c 257#define CU_11_8_MASK_OFF 0x47c
258// 11.8 tested on 520.56.06
252#define CU_12_0_MASK_OFF 0x4cc 259#define CU_12_0_MASK_OFF 0x4cc
253// CUDA 12.0 and 12.1 use the same offset 260// CUDA 12.0 and 12.1 use the same offset
261// 12.0 tested on 525.147.05
262#define CU_12_2_MASK_OFF 0x4e4
263// 12.2 tested on 535.129.03
264
265// Offsets for the stream struct on aarch64
266// All tested on Nov 13th, 2023
267#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2
268#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier
269#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin
254 270
255// Layout in CUDA's `stream` struct 271// Used up through CUDA 11.8 in the stream struct
256struct stream_sm_mask { 272struct stream_sm_mask {
257 uint32_t upper; 273 uint32_t upper;
258 uint32_t lower; 274 uint32_t lower;
259} __attribute__((packed)); 275};
276
277// Used starting with CUDA 12.0 in the stream struct
278struct stream_sm_mask_v2 {
279 uint32_t enabled;
280 uint32_t mask[4];
281};
260 282
261// Check if this system has a Parker SoC (TX2/PX2 chip) 283// Check if this system has a Parker SoC (TX2/PX2 chip)
262// (CUDA 9.0 behaves slightly different on this platform.) 284// (CUDA 9.0 behaves slightly different on this platform.)
@@ -286,36 +308,29 @@ int detect_parker_soc() {
286} 308}
287#endif // __aarch64__ 309#endif // __aarch64__
288 310
289// Should work for CUDA 8.0 through 12.1 311// Should work for CUDA 8.0 through 12.2
290// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in 312// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
291// our header 313// our header
292void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { 314void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
315 uint128_t full_mask = -1;
316 full_mask <<= 64;
317 full_mask |= mask;
318 libsmctrl_set_stream_mask_ext(stream, full_mask);
319}
320
321void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
293 char* stream_struct_base = *(char**)stream; 322 char* stream_struct_base = *(char**)stream;
294 struct stream_sm_mask* hw_mask; 323 struct stream_sm_mask* hw_mask = NULL;
324 struct stream_sm_mask_v2* hw_mask_v2 = NULL;
295 int ver; 325 int ver;
296 cuDriverGetVersion(&ver); 326 cuDriverGetVersion(&ver);
297 switch (ver) { 327 switch (ver) {
328#if __x86_64__
298 case 8000: 329 case 8000:
299 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); 330 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
300 case 9000: 331 case 9000:
301 case 9010: { 332 case 9010: {
302 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); 333 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
303#if __aarch64__
304 // Jetson TX2 offset is slightly different on CUDA 9.0.
305 // Only compile the check into ARM64 builds.
306 int is_parker;
307 const char* err_str;
308 if ((is_parker = detect_parker_soc()) < 0) {
309 cuGetErrorName(-is_parker, &err_str);
310 fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call "
311 "failed while doing compatibilty test."
312 "Error, '%s'. Not applying stream "
313 "mask.\n", err_str);
314 }
315
316 if (is_parker)
317 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2);
318#endif
319 break; 334 break;
320 } 335 }
321 case 9020: 336 case 9020:
@@ -349,25 +364,66 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
349 break; 364 break;
350 case 12000: 365 case 12000:
351 case 12010: 366 case 12010:
352 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF); 367 hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF);
368 break;
369 case 12020:
370 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
371 break;
372#elif __aarch64__
373 case 9000: {
374 // Jetson TX2 offset is slightly different on CUDA 9.0.
375 // Only compile the check into ARM64 builds.
376 // TODO: Always verify Jetson-board-only on aarch64.
377 int is_parker;
378 const char* err_str;
379 if ((is_parker = detect_parker_soc()) < 0) {
380 cuGetErrorName(-is_parker, &err_str);
381 abort(1, 0, "While performing platform-specific "
382 "compatibility checks for stream masking, "
383 "CUDA call failed with error '%s'.", err_str);
384 }
385
386 if (!is_parker)
387 abort(1, 0, "Not supported on non-Jetson aarch64.");
388 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON);
353 break; 389 break;
354 default: { 390 }
355 // For experimenting to determine the right mask offset, set the MASK_OFF 391 case 10020:
356 // environment variable (positive and negative numbers are supported) 392 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON);
357 char* mask_off_str = getenv("MASK_OFF"); 393 break;
358 fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver); 394 case 11040:
359 if (mask_off_str) { 395 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
360 int off = atoi(mask_off_str); 396 break;
361 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x " 397#endif
362 "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off); 398 }
363 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off); 399
364 } else { 400 // For experimenting to determine the right mask offset, set the MASK_OFF
365 return; 401 // environment variable (positive and negative numbers are supported)
366 }} 402 char* mask_off_str = getenv("MASK_OFF");
403 if (mask_off_str) {
404 int off = atoi(mask_off_str);
405 fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x "
406 "(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off);
407 if (CU_12_2_MASK_OFF + off < 0)
408 abort(1, 0, "Total offset cannot be less than 0! Aborting...");
409 // +4 bytes to convert a mask found with this for use with hw_mask
410 hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off);
367 } 411 }
368 412
369 hw_mask->upper = mask >> 32; 413 // Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs
370 hw_mask->lower = mask; 414 if (hw_mask) {
415 hw_mask->upper = mask >> 32;
416 hw_mask->lower = mask;
417 } else if (hw_mask_v2) {
418 hw_mask_v2->enabled = 1;
419 hw_mask_v2->mask[0] = mask;
420 hw_mask_v2->mask[1] = mask >> 32;
421 hw_mask_v2->mask[2] = mask >> 64;
422 hw_mask_v2->mask[3] = mask >> 96;
423 } else {
424 abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and"
425 " no fallback MASK_OFF set!", ver);
426 }
371} 427}
372 428
373/* INFORMATIONAL FUNCTIONS */ 429/* INFORMATIONAL FUNCTIONS */
diff --git a/libsmctrl.h b/libsmctrl.h
index 990d434..a8207b4 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -9,6 +9,7 @@ extern "C" {
9#endif 9#endif
10 10
11#include <stdint.h> 11#include <stdint.h>
12typedef unsigned __int128 uint128_t;
12 13
13/* PARTITIONING FUNCTIONS */ 14/* PARTITIONING FUNCTIONS */
14 15
@@ -22,6 +23,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
22// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 23// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
23// Supported: CUDA 8.0 - CUDA 12.1 24// Supported: CUDA 8.0 - CUDA 12.1
24extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); 25extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
26extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
25// Set TPC mask for the next kernel launch from the caller's CPU thread 27// Set TPC mask for the next kernel launch from the caller's CPU thread
26// (overrides global and per-stream masks, applies only to next launch). 28// (overrides global and per-stream masks, applies only to next launch).
27// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 29// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)