Fix stream masking on many platforms and support >64-bit stream masks

Previously, the library did not distinguish between aarch64 and x86_64 stream offsets, causing incorrect offsets to be used in many circumstances. This has now been fixed. A new function, libsmctrl_set_stream_mask_ext(), has also been added, which supports masking up to 128 TPCs (rather than just 64).
author: Joshua Bakita <bakitajoshua@gmail.com> 2023-11-29 17:52:28 -0500
committer: Joshua Bakita <bakitajoshua@gmail.com> 2023-11-29 18:24:14 -0500
commit: 3c075c8f71a7c85d735018143fc13a6eb91813eb (patch)
tree: ad31bcb0f409364622c964e5f6d200201287ba5a
parent: 3ee974590403730f2fea911a2574d335cedc4fab (diff)
2 files changed, 96 insertions, 38 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index f932b5f..526331f 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -30,6 +30,8 @@
 #include <dlfcn.h>
+#include "libsmctrl.h"
 // In functions that do not return an error code, we favor terminating with an
 // error rather than merely printing a warning and continuing.
 #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
@@ -235,28 +237,48 @@ void libsmctrl_set_next_mask(uint64_t mask) {
 /*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
+// Offsets for the stream struct on x86_64
 #define CU_8_0_MASK_OFF 0xec
 #define CU_9_0_MASK_OFF 0x130
-#define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2
 // CUDA 9.0 and 9.1 use the same offset
+// 9.1 tested on 390.157
 #define CU_9_2_MASK_OFF 0x140
-#define CU_10_0_MASK_OFF 0x24c
+#define CU_10_0_MASK_OFF 0x244
 // CUDA 10.0, 10.1 and 10.2 use the same offset
+// 10.1 tested on 418.113
+// 10.2 tested on 440.100, 440.82, 440.64, and 440.36
 #define CU_11_0_MASK_OFF 0x274
 #define CU_11_1_MASK_OFF 0x2c4
 #define CU_11_2_MASK_OFF 0x37c
 // CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
+// 11.4 tested on 470.223.02
 #define CU_11_6_MASK_OFF 0x38c
 #define CU_11_7_MASK_OFF 0x3c4
 #define CU_11_8_MASK_OFF 0x47c
+// 11.8 tested on 520.56.06
 #define CU_12_0_MASK_OFF 0x4cc
 // CUDA 12.0 and 12.1 use the same offset
+// 12.0 tested on 525.147.05
+#define CU_12_2_MASK_OFF 0x4e4
+// 12.2 tested on 535.129.03
+// Offsets for the stream struct on aarch64
+// All tested on Nov 13th, 2023
+#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2
+#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier
+#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin
-// Layout in CUDA's `stream` struct
+// Used up through CUDA 11.8 in the stream struct
 struct stream_sm_mask {
        uint32_t upper;
        uint32_t lower;
-} __attribute__((packed));
+};
+// Used starting with CUDA 12.0 in the stream struct
+struct stream_sm_mask_v2 {
+        uint32_t enabled;
+        uint32_t mask[4];
+};
 // Check if this system has a Parker SoC (TX2/PX2 chip)
 // (CUDA 9.0 behaves slightly different on this platform.)
@@ -286,36 +308,29 @@ int detect_parker_soc() {
 }
 #endif // __aarch64__
-// Should work for CUDA 8.0 through 12.1
+// Should work for CUDA 8.0 through 12.2
 // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
 // our header
 void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
+        uint128_t full_mask = -1;
+        full_mask <<= 64;
+        full_mask |= mask;
+        libsmctrl_set_stream_mask_ext(stream, full_mask);
+}
+void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
        char* stream_struct_base = *(char**)stream;
-        struct stream_sm_mask* hw_mask;
+        struct stream_sm_mask* hw_mask = NULL;
+        struct stream_sm_mask_v2* hw_mask_v2 = NULL;
        int ver;
        cuDriverGetVersion(&ver);
        switch (ver) {
+#if __x86_64__
        case 8000:
                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
        case 9000:
        case 9010: {
                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
-#if __aarch64__
-                // Jetson TX2 offset is slightly different on CUDA 9.0.
-                // Only compile the check into ARM64 builds.
-                int is_parker;
-                const char* err_str;
-                if ((is_parker = detect_parker_soc()) < 0) {
-                        cuGetErrorName(-is_parker, &err_str);
-                        fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call "
-                                        "failed while doing compatibilty test."
-                                        "Error, '%s'. Not applying stream "
-                                        "mask.\n", err_str);
-                }
-                if (is_parker)
-                        hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2);
-#endif
                break;
        }
        case 9020:
@@ -349,25 +364,66 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
                break;
        case 12000:
        case 12010:
-                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
+                hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF);
+                break;
+        case 12020:
+                hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
+                break;
+#elif __aarch64__
+        case 9000: {
+                // Jetson TX2 offset is slightly different on CUDA 9.0.
+                // Only compile the check into ARM64 builds.
+                // TODO: Always verify Jetson-board-only on aarch64.
+                int is_parker;
+                const char* err_str;
+                if ((is_parker = detect_parker_soc()) < 0) {
+                        cuGetErrorName(-is_parker, &err_str);
+                        abort(1, 0, "While performing platform-specific "
+                                    "compatibility checks for stream masking, "
+                                    "CUDA call failed with error '%s'.", err_str);
+                }
+                if (!is_parker)
+                        abort(1, 0, "Not supported on non-Jetson aarch64.");
+                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON);
                break;
-        default: {
+        }
-                // For experimenting to determine the right mask offset, set the MASK_OFF
+        case 10020:
-                // environment variable (positive and negative numbers are supported)
+                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON);
-                char* mask_off_str = getenv("MASK_OFF");
+                break;
-                fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
+        case 11040:
-                if (mask_off_str) {
+                hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
-                        int off = atoi(mask_off_str);
+                break;
-                        fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "
+#endif
-                                        "(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);
+        }
-                        hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);
-                } else {
+        // For experimenting to determine the right mask offset, set the MASK_OFF
-                        return;
+        // environment variable (positive and negative numbers are supported)
-                }}
+        char* mask_off_str = getenv("MASK_OFF");
+        if (mask_off_str) {
+                int off = atoi(mask_off_str);
+                fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x "
+                                "(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off);
+                if (CU_12_2_MASK_OFF + off < 0)
+                        abort(1, 0, "Total offset cannot be less than 0! Aborting...");
+                // +4 bytes to convert a mask found with this for use with hw_mask
+                hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off);
        }
-        hw_mask->upper = mask >> 32;
+        // Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs
-        hw_mask->lower = mask;
+        if (hw_mask) {
+                hw_mask->upper = mask >> 32;
+                hw_mask->lower = mask;
+        } else if (hw_mask_v2) {
+                hw_mask_v2->enabled = 1;
+                hw_mask_v2->mask[0] = mask;
+                hw_mask_v2->mask[1] = mask >> 32;
+                hw_mask_v2->mask[2] = mask >> 64;
+                hw_mask_v2->mask[3] = mask >> 96;
+        } else {
+                abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and"
+                            " no fallback MASK_OFF set!", ver);
+        }
 }
 /* INFORMATIONAL FUNCTIONS */
diff --git a/libsmctrl.h b/libsmctrl.h
index 990d434..a8207b4 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -9,6 +9,7 @@ extern "C" {
 #endif
 #include <stdint.h>
+typedef unsigned __int128 uint128_t;
 /* PARTITIONING FUNCTIONS */
@@ -22,6 +23,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
 // @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
 // Supported: CUDA 8.0 - CUDA 12.1
 extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
+extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
 // Set TPC mask for the next kernel launch from the caller's CPU thread
 // (overrides global and per-stream masks, applies only to next launch).
 // @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-11-29 17:52:28 -0500
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-11-29 18:24:14 -0500
commit	3c075c8f71a7c85d735018143fc13a6eb91813eb (patch)
tree	ad31bcb0f409364622c964e5f6d200201287ba5a
parent	3ee974590403730f2fea911a2574d335cedc4fab (diff)

diff --git a/libsmctrl.c b/libsmctrl.c index f932b5f..526331f 100644 --- a/libsmctrl.c +++ b/libsmctrl.c
@@ -30,6 +30,8 @@
30		30
31	#include <dlfcn.h>	31	#include <dlfcn.h>
32		32
		33	#include "libsmctrl.h"
		34
33	// In functions that do not return an error code, we favor terminating with an	35	// In functions that do not return an error code, we favor terminating with an
34	// error rather than merely printing a warning and continuing.	36	// error rather than merely printing a warning and continuing.
35	#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \	37	#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
@@ -235,28 +237,48 @@ void libsmctrl_set_next_mask(uint64_t mask) {
235		237
236	/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/	238	/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
237		239
		240	// Offsets for the stream struct on x86_64
238	#define CU_8_0_MASK_OFF 0xec	241	#define CU_8_0_MASK_OFF 0xec
239	#define CU_9_0_MASK_OFF 0x130	242	#define CU_9_0_MASK_OFF 0x130
240	#define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2
241	// CUDA 9.0 and 9.1 use the same offset	243	// CUDA 9.0 and 9.1 use the same offset
		244	// 9.1 tested on 390.157
242	#define CU_9_2_MASK_OFF 0x140	245	#define CU_9_2_MASK_OFF 0x140
243	#define CU_10_0_MASK_OFF 0x24c	246	#define CU_10_0_MASK_OFF 0x244
244	// CUDA 10.0, 10.1 and 10.2 use the same offset	247	// CUDA 10.0, 10.1 and 10.2 use the same offset
		248	// 10.1 tested on 418.113
		249	// 10.2 tested on 440.100, 440.82, 440.64, and 440.36
245	#define CU_11_0_MASK_OFF 0x274	250	#define CU_11_0_MASK_OFF 0x274
246	#define CU_11_1_MASK_OFF 0x2c4	251	#define CU_11_1_MASK_OFF 0x2c4
247	#define CU_11_2_MASK_OFF 0x37c	252	#define CU_11_2_MASK_OFF 0x37c
248	// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset	253	// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
		254	// 11.4 tested on 470.223.02
249	#define CU_11_6_MASK_OFF 0x38c	255	#define CU_11_6_MASK_OFF 0x38c
250	#define CU_11_7_MASK_OFF 0x3c4	256	#define CU_11_7_MASK_OFF 0x3c4
251	#define CU_11_8_MASK_OFF 0x47c	257	#define CU_11_8_MASK_OFF 0x47c
		258	// 11.8 tested on 520.56.06
252	#define CU_12_0_MASK_OFF 0x4cc	259	#define CU_12_0_MASK_OFF 0x4cc
253	// CUDA 12.0 and 12.1 use the same offset	260	// CUDA 12.0 and 12.1 use the same offset
		261	// 12.0 tested on 525.147.05
		262	#define CU_12_2_MASK_OFF 0x4e4
		263	// 12.2 tested on 535.129.03
		264
		265	// Offsets for the stream struct on aarch64
		266	// All tested on Nov 13th, 2023
		267	#define CU_9_0_MASK_OFF_JETSON 0x128 // Tested on TX2
		268	#define CU_10_2_MASK_OFF_JETSON 0x24c // Tested on TX2 and Jetson Xavier
		269	#define CU_11_4_MASK_OFF_JETSON 0x394 // Tested on Jetson Orin
254		270
255	// Layout in CUDA's `stream` struct	271	// Used up through CUDA 11.8 in the stream struct
256	struct stream_sm_mask {	272	struct stream_sm_mask {
257	uint32_t upper;	273	uint32_t upper;
258	uint32_t lower;	274	uint32_t lower;
259	} __attribute__((packed));	275	};
		276
		277	// Used starting with CUDA 12.0 in the stream struct
		278	struct stream_sm_mask_v2 {
		279	uint32_t enabled;
		280	uint32_t mask[4];
		281	};
260		282
261	// Check if this system has a Parker SoC (TX2/PX2 chip)	283	// Check if this system has a Parker SoC (TX2/PX2 chip)
262	// (CUDA 9.0 behaves slightly different on this platform.)	284	// (CUDA 9.0 behaves slightly different on this platform.)
@@ -286,36 +308,29 @@ int detect_parker_soc() {
286	}	308	}
287	#endif // __aarch64__	309	#endif // __aarch64__
288		310
289	// Should work for CUDA 8.0 through 12.1	311	// Should work for CUDA 8.0 through 12.2
290	// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in	312	// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
291	// our header	313	// our header
292	void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {	314	void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
		315	uint128_t full_mask = -1;
		316	full_mask <<= 64;
		317	full_mask |= mask;
		318	libsmctrl_set_stream_mask_ext(stream, full_mask);
		319	}
		320
		321	void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
293	char* stream_struct_base = *(char**)stream;	322	char* stream_struct_base = *(char**)stream;
294	struct stream_sm_mask* hw_mask;	323	struct stream_sm_mask* hw_mask = NULL;
		324	struct stream_sm_mask_v2* hw_mask_v2 = NULL;
295	int ver;	325	int ver;
296	cuDriverGetVersion(&ver);	326	cuDriverGetVersion(&ver);
297	switch (ver) {	327	switch (ver) {
		328	#if __x86_64__
298	case 8000:	329	case 8000:
299	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);	330	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
300	case 9000:	331	case 9000:
301	case 9010: {	332	case 9010: {
302	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);	333	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
303	#if __aarch64__
304	// Jetson TX2 offset is slightly different on CUDA 9.0.
305	// Only compile the check into ARM64 builds.
306	int is_parker;
307	const char* err_str;
308	if ((is_parker = detect_parker_soc()) < 0) {
309	cuGetErrorName(-is_parker, &err_str);
310	fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call "
311	"failed while doing compatibilty test."
312	"Error, '%s'. Not applying stream "
313	"mask.\n", err_str);
314	}
315
316	if (is_parker)
317	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2);
318	#endif
319	break;	334	break;
320	}	335	}
321	case 9020:	336	case 9020:
@@ -349,25 +364,66 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
349	break;	364	break;
350	case 12000:	365	case 12000:
351	case 12010:	366	case 12010:
352	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);	367	hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF);
		368	break;
		369	case 12020:
		370	hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
		371	break;
		372	#elif __aarch64__
		373	case 9000: {
		374	// Jetson TX2 offset is slightly different on CUDA 9.0.
		375	// Only compile the check into ARM64 builds.
		376	// TODO: Always verify Jetson-board-only on aarch64.
		377	int is_parker;
		378	const char* err_str;
		379	if ((is_parker = detect_parker_soc()) < 0) {
		380	cuGetErrorName(-is_parker, &err_str);
		381	abort(1, 0, "While performing platform-specific "
		382	"compatibility checks for stream masking, "
		383	"CUDA call failed with error '%s'.", err_str);
		384	}
		385
		386	if (!is_parker)
		387	abort(1, 0, "Not supported on non-Jetson aarch64.");
		388	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON);
353	break;	389	break;
354	default: {	390	}
355	// For experimenting to determine the right mask offset, set the MASK_OFF	391	case 10020:
356	// environment variable (positive and negative numbers are supported)	392	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON);
357	char* mask_off_str = getenv("MASK_OFF");	393	break;
358	fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);	394	case 11040:
359	if (mask_off_str) {	395	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
360	int off = atoi(mask_off_str);	396	break;
361	fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.1 base %#x "	397	#endif
362	"(total off: %#x)\n", off, CU_12_0_MASK_OFF, CU_12_0_MASK_OFF+off);	398	}
363	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF + off);	399
364	} else {	400	// For experimenting to determine the right mask offset, set the MASK_OFF
365	return;	401	// environment variable (positive and negative numbers are supported)
366	}}	402	char* mask_off_str = getenv("MASK_OFF");
		403	if (mask_off_str) {
		404	int off = atoi(mask_off_str);
		405	fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x "
		406	"(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off);
		407	if (CU_12_2_MASK_OFF + off < 0)
		408	abort(1, 0, "Total offset cannot be less than 0! Aborting...");
		409	// +4 bytes to convert a mask found with this for use with hw_mask
		410	hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off);
367	}	411	}
368		412
369	hw_mask->upper = mask >> 32;	413	// Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs
370	hw_mask->lower = mask;	414	if (hw_mask) {
		415	hw_mask->upper = mask >> 32;
		416	hw_mask->lower = mask;
		417	} else if (hw_mask_v2) {
		418	hw_mask_v2->enabled = 1;
		419	hw_mask_v2->mask[0] = mask;
		420	hw_mask_v2->mask[1] = mask >> 32;
		421	hw_mask_v2->mask[2] = mask >> 64;
		422	hw_mask_v2->mask[3] = mask >> 96;
		423	} else {
		424	abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and"
		425	" no fallback MASK_OFF set!", ver);
		426	}
371	}	427	}
372		428
373	/* INFORMATIONAL FUNCTIONS */	429	/* INFORMATIONAL FUNCTIONS */


diff --git a/libsmctrl.h b/libsmctrl.h index 990d434..a8207b4 100644 --- a/libsmctrl.h +++ b/libsmctrl.h
@@ -9,6 +9,7 @@ extern "C" {
9	#endif	9	#endif
10		10
11	#include <stdint.h>	11	#include <stdint.h>
		12	typedef unsigned __int128 uint128_t;
12		13
13	/* PARTITIONING FUNCTIONS */	14	/* PARTITIONING FUNCTIONS */
14		15
@@ -22,6 +23,7 @@ extern void libsmctrl_set_global_mask(uint64_t mask);
22	// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)	23	// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
23	// Supported: CUDA 8.0 - CUDA 12.1	24	// Supported: CUDA 8.0 - CUDA 12.1
24	extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);	25	extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
		26	extern void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask);
25	// Set TPC mask for the next kernel launch from the caller's CPU thread	27	// Set TPC mask for the next kernel launch from the caller's CPU thread
26	// (overrides global and per-stream masks, applies only to next launch).	28	// (overrides global and per-stream masks, applies only to next launch).
27	// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)	29	// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)