Initial reimplementation of libsmctrl as a library

- Tested working with cuda_scheduling_examiner - Supports everything described in the accepted RTAS'23 paper - Can be used as either a shared or statically-linked library - Documented in libsmctrl.h
author: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-02 22:14:22 -0500
committer: Joshua Bakita <bakitajoshua@gmail.com> 2023-03-02 22:14:22 -0500
commit: 7db0d3088a6e25c7c64999a20267f55751571dee (patch)
tree: 9867d0ede3818ade3a63f942446b40d2f1446254
5 files changed, 433 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dcff266
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+libsmctrl.a
+libsmctrl.o
+libsmctrl.so
+libsmctrl_test_gpc_info
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..aa59792
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+CC = gcc
+# -fPIC is needed in all cases, as we may be linked into another shared library
+CFLAGS = -fPIC
+LDFLAGS = -lcuda -I/usr/local/cuda/include
+.PHONY: clean tests
+libsmctrl.so: libsmctrl.c libsmctrl.h
+        $(CC) $< -shared -o $@ $(CFLAGS) $(LDFLAGS)
+libsmctrl.a: libsmctrl.c libsmctrl.h
+        $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
+        ar rcs $@ libsmctrl.o
+# The test links with -lsmctrl, so make sure a library exists before linking
+libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.h libsmctrl.a
+        $(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS)
+tests: libsmctrl_test_gpc_info
+# Remove every build product (the same set that .gitignore lists)
+clean:
+        rm -f libsmctrl.so libsmctrl.a libsmctrl.o libsmctrl_test_gpc_info
diff --git a/libsmctrl.c b/libsmctrl.c
new file mode 100644
index 0000000..69b19a1
--- /dev/null
+++ b/libsmctrl.c
@@ -0,0 +1,332 @@
+/**
+ * Copyright 2022 Joshua Bakita
+ * Library to control SM masks on CUDA launches. Co-opts preexisting debug
+ * logic in the CUDA driver library, and thus requires a build with -lcuda.
+ */
+//#include "/playpen/playpen/cuda-11.8/include/cuda.h"
+#include <cuda.h>
+//#include <cuda_runtime.h>
+//#ifndef CUDA_VERSION
+//#warning libsmctrl: CUDA driver library must be included before libsmctrl.h.
+//#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+// Layout of mask control fields in CUDA's `globals` struct
+struct global_sm_control {
+        // Set to 1 by libsmctrl_set_global_mask() after the mask is written
+        uint32_t enabled;
+        // 64-bit SM mask; follows `enabled` with no padding (struct is packed)
+        uint64_t mask;
+} __attribute__((packed));
+/*** CUDA Globals Manipulation. CUDA 10.2 only ***/
+// Ends up being 0x7fb7fa3408 in some binaries
+static struct global_sm_control* g_sm_control = NULL;
+/* Find the location of CUDA's `globals` struct and the SM mask control fields
+ * No symbols are exported from within `globals`, so this has to do a very
+ * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
+ * Don't call this before the cuda library has been initialized.
+ */
+// Resolves `g_sm_control` on first use; returns immediately once it is set.
+static void setup_sm_control_10() {
+        if (g_sm_control)
+                return;
+        // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by
+        // the loader, but not subject to ASLR (it's always at a constant
+        // offset in the loaded instance of libcuda.so). Our target is also at
+        // a constant offset, so we can use the address of
+        // cudbgReportDriverApiErrorFlags as a reference point.
+        // Note: cudbgReportDriverApiErrorFlags is currently the closest known
+        // symbol to **the table**. cudbgDebuggerInitialized is the closest to
+        // globals itself (+7424 == SM mask control), but we prefer the table
+        // lookup approach for now, as that's what cuDeviceGetCount() does.
+        extern uint32_t cudbgReportDriverApiErrorFlags;
+        // NOTE(review): the symbol reference is commented out, so `sym` is a
+        // null pointer and `tbl_base` below is computed relative to address 0
+        // instead of relative to libcuda.so. Confirm whether this code path is
+        // intentionally disabled before relying on it.
+        uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;
+        // In some binaries, the following works out to 0x7fb7ea6000, and
+        // that's what shows up in the adrp instruction in cuDeviceGetCount()
+        // in the lead-up to get globals.numDevices. Find this offset by
+        // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,
+        // disassembling the prior instructions, taking the adrp constant, and
+        // subtracting the address of cudbgReportDriverApiErrorFlags from it.
+        uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
+        // Address of `globals` is at offset 3672 (entry 459?)
+        uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64
+        // SM mask control is at offset 4888 in the `globals` struct
+        g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
+        // SM mask should be empty by default
+        // Sanity check only: a warning is printed, but g_sm_control stays set
+        if (g_sm_control->enabled || g_sm_control->mask)
+                fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");
+}
+/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
+// Tested working on CUDA x86_64 11.0-11.8.
+// Tested not working on aarch64 or x86_64 10.2
+static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
+#define LAUNCH_DOMAIN 0x3
+#define LAUNCH_PRE_UPLOAD 0x3
+static uint64_t g_sm_mask = 0;
+static __thread uint64_t g_next_sm_mask = 0;
+static char sm_control_setup_called = 0;
+/* Launch callback registered by setup_sm_control_11().
+ * Writes an SM mask into the launch data reachable through `in_params`:
+ * the one-shot per-thread mask (`g_next_sm_mask`) if it is non-zero, otherwise
+ * the global mask (`g_sm_mask`), but only when both mask words are still zero.
+ * `ukwn`, `domain` and `cbid` are not read.
+ */
+static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
+        // The first 32-bit word of in_params must be at least 0x50
+        // (presumably a struct size or version field -- TODO confirm)
+        if (*(uint32_t*)in_params < 0x50) {
+                fprintf(stderr, "Unsupported CUDA version for callback-based SM masking. Aborting...\n");
+                return;
+        }
+        // The cast binds tighter than `+`, so this is pointer-sized entry 8 of
+        // in_params, dereferenced twice. Only the final pointer is checked.
+        if (!**((uintptr_t***)in_params+8)) {
+                fprintf(stderr, "Called with NULL halLaunchDataAllocation\n");
+                return;
+        }
+        //fprintf(stderr, "cta: %lx\n", *(uint64_t*)(**((char***)in_params + 8) + 74));
+        // TODO: Check for supported QMD version (>XXX, <4.00)
+        // TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 72 bytes (rather than 8 bytes) wide
+        // Lower and upper 32-bit mask words sit at byte offsets 84 and 88
+        uint32_t *lower_ptr = (uint32_t*)(**((char***)in_params + 8) + 84);
+        uint32_t *upper_ptr = (uint32_t*)(**((char***)in_params + 8) + 88);
+        if (g_next_sm_mask) {
+                // One-shot mask: apply it, then clear it for this thread.
+                // A next mask of 0 is treated as "not set".
+                *lower_ptr = (uint32_t)g_next_sm_mask;
+                *upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
+                g_next_sm_mask = 0;
+        } else if (!*lower_ptr && !*upper_ptr){
+                // Only apply the global mask if a per-stream mask hasn't been set
+                *lower_ptr = (uint32_t)g_sm_mask;
+                *upper_ptr = (uint32_t)(g_sm_mask >> 32);
+        }
+        //fprintf(stderr, "lower mask: %x\n", *lower_ptr);
+        //fprintf(stderr, "upper mask: %x\n", *upper_ptr);
+}
+/* Register launchCallback() with the CUDA driver's callback interface.
+ * Looks up the export table identified by `callback_funcs_id`, takes the
+ * subscribe function from entry 3 and the enable function from entry 6, then
+ * subscribes and enables the LAUNCH_DOMAIN/LAUNCH_PRE_UPLOAD callback.
+ * Runs at most once per process; a failed attempt is not retried, because
+ * `sm_control_setup_called` is already set by then. Errors go to stderr.
+ */
+static void setup_sm_control_11() {
+        int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
+        int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
+        uintptr_t* tbl_base = NULL;
+        uint32_t my_hndl;
+        int res;
+        // Avoid race conditions (setup can only be called once)
+        if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
+                return;
+        // Never read function pointers out of a table that was not returned
+        res = cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
+        if (res != CUDA_SUCCESS || !tbl_base) {
+                fprintf(stderr, "libsmctrl: Error getting callback export table. Error %d\n", res);
+                return;
+        }
+        subscribe = (typeof(subscribe))tbl_base[3];
+        enable = (typeof(enable))tbl_base[6];
+        if (!subscribe || !enable) {
+                fprintf(stderr, "libsmctrl: Callback export table is missing required entries\n");
+                return;
+        }
+        res = subscribe(&my_hndl, launchCallback, NULL);
+        if (res) {
+                fprintf(stderr, "libsmctrl: Error subscribing to launch callback. Error %d\n", res);
+                return;
+        }
+        res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD);
+        if (res)
+                fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
+}
+// Common masking control
+/* Set the mask applied by default to every kernel launch.
+ * Drivers newer than 10.2 (version > 10020) use the launch-callback path and
+ * store the mask in `g_sm_mask`; older drivers write the mask straight into
+ * the driver's `globals` struct via `g_sm_control`.
+ */
+void libsmctrl_set_global_mask(uint64_t mask) {
+        int driver_version;
+        cuDriverGetVersion(&driver_version);
+        if (driver_version > 10020) {
+                // Callback-based masking: register the callback on first use
+                if (!sm_control_setup_called)
+                        setup_sm_control_11();
+                g_sm_mask = mask;
+                return;
+        }
+        // Globals-based masking: locate the control fields on first use
+        if (!g_sm_control)
+                setup_sm_control_10();
+        g_sm_control->mask = mask;
+        g_sm_control->enabled = 1;
+}
+// Deprecated old name (see libsmctrl.h); forwards `mask` unchanged to
+// libsmctrl_set_global_mask().
+void set_sm_mask(uint64_t mask) {
+        libsmctrl_set_global_mask(mask);
+}
+// Set mask for next launch from this thread.
+// Registers the launch callback if setup has not been attempted yet, then
+// stores `mask` in the thread-local `g_next_sm_mask`. launchCallback() applies
+// that value to one launch and resets it to 0.
+// Note: launchCallback() only tests `g_next_sm_mask` for non-zero, so a
+// `mask` of 0 behaves the same as not setting a next mask at all.
+void libsmctrl_set_next_mask(uint64_t mask) {
+        if (!sm_control_setup_called)
+                setup_sm_control_11();
+        g_next_sm_mask = mask;
+}
+/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
+#define CU_8_0_MASK_OFF 0xec
+#define CU_9_0_MASK_OFF 0x130
+// CUDA 9.0 and 9.1 use the same offset
+#define CU_9_2_MASK_OFF 0x140
+#define CU_10_0_MASK_OFF 0x24c
+// CUDA 10.0, 10.1 and 10.2 use the same offset
+#define CU_11_0_MASK_OFF 0x274
+#define CU_11_1_MASK_OFF 0x2c4
+#define CU_11_2_MASK_OFF 0x37c
+// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
+#define CU_11_6_MASK_OFF 0x38c
+#define CU_11_7_MASK_OFF 0x3c4
+#define CU_11_8_MASK_OFF 0x47c
+#define CU_12_0_MASK_OFF 0x4cc
+// CUDA 12.0 and 12.1 use the same offset
+// Layout in CUDA's `stream` struct
+struct stream_sm_mask {
+        // Receives bits 63..32 of the mask (see libsmctrl_set_stream_mask())
+        uint32_t upper;
+        // Receives bits 31..0 of the mask; stored after `upper`
+        uint32_t lower;
+} __attribute__((packed));
+/**
+ * Set the mask applied to kernels launched via `stream`.
+ *
+ * Writes `mask` into CUDA's internal stream struct at an offset chosen by the
+ * driver version from cuDriverGetVersion(). Offsets are listed for CUDA
+ * 8.0-12.1. On any other version a message is printed and nothing is written,
+ * unless the MASK_OFF environment variable is set. In that case its decimal
+ * value (positive or negative) is added to the CUDA 11.8 offset, which is
+ * intended for experimenting to find the right offset.
+ *
+ * @param stream A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h
+ *               dependency in our header. A NULL handle is rejected.
+ * @param mask   64-bit mask; stored as an upper and a lower 32-bit word
+ */
+void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
+        char* stream_struct_base;
+        struct stream_sm_mask* hw_mask;
+        long mask_off;
+        int ver;
+        // A NULL handle cannot be dereferenced to reach a stream struct
+        if (!stream) {
+                fprintf(stderr, "libsmctrl: Cannot set a stream mask on a NULL stream!\n");
+                return;
+        }
+        stream_struct_base = *(char**)stream;
+        cuDriverGetVersion(&ver);
+        // Every case must end in `break`, or the next offset overwrites it
+        switch (ver) {
+        case 8000:
+                mask_off = CU_8_0_MASK_OFF;
+                break;
+        case 9000:
+        case 9010:
+                mask_off = CU_9_0_MASK_OFF;
+                break;
+        case 9020:
+                mask_off = CU_9_2_MASK_OFF;
+                break;
+        case 10000:
+        case 10010:
+        case 10020:
+                mask_off = CU_10_0_MASK_OFF;
+                break;
+        case 11000:
+                mask_off = CU_11_0_MASK_OFF;
+                break;
+        case 11010:
+                mask_off = CU_11_1_MASK_OFF;
+                break;
+        case 11020:
+        case 11030:
+        case 11040:
+        case 11050:
+                mask_off = CU_11_2_MASK_OFF;
+                break;
+        case 11060:
+                mask_off = CU_11_6_MASK_OFF;
+                break;
+        case 11070:
+                mask_off = CU_11_7_MASK_OFF;
+                break;
+        case 11080:
+                mask_off = CU_11_8_MASK_OFF;
+                break;
+        case 12000:
+        case 12010:
+                mask_off = CU_12_0_MASK_OFF;
+                break;
+        default: {
+                // For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported)
+                char* mask_off_str = getenv("MASK_OFF");
+                fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
+                if (!mask_off_str)
+                        return;
+                int off = (int)strtol(mask_off_str, NULL, 10);
+                fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off);
+                mask_off = (long)CU_11_8_MASK_OFF + off;
+                break;
+        }
+        }
+        hw_mask = (struct stream_sm_mask*)(stream_struct_base + mask_off);
+        hw_mask->upper = (uint32_t)(mask >> 32);
+        hw_mask->lower = (uint32_t)mask;
+}
+/**
+ * Get the total number of TPCs on device number `dev`.
+ *
+ * Derived from the SM count and the compute capability: two SMs per TPC for
+ * compute capability 6.0 and for everything with major version above 6,
+ * otherwise one SM per TPC.
+ *
+ * @param num_tpcs Receives the TPC count; written only on success
+ * @param dev      Device number passed to cuDeviceGetAttribute()
+ * @return 0 on success, -EIO if a device attribute cannot be queried,
+ *         -ENOTSUP if the compute capability is below 3.5
+ */
+int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
+        int num_sms;
+        int major;
+        int minor;
+        // TODO: Use nvdebug instead of this hardcoded hack
+        // Bail out on any failed query rather than using uninitialized values
+        if (cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev) != CUDA_SUCCESS ||
+            cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev) != CUDA_SUCCESS ||
+            cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev) != CUDA_SUCCESS) {
+                fprintf(stderr, "libsmctrl: Unable to query attributes of device %d\n", dev);
+                return -EIO;
+        }
+        // SM masking only works on sm_35+
+        if (major < 3 || (major == 3 && minor < 5))
+                return -ENOTSUP;
+        // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
+        // as the P100, which is uniquely sm_60
+        int sms_per_tpc;
+        if (major > 6 || (major == 6 && minor == 0))
+                sms_per_tpc = 2;
+        else
+                sms_per_tpc = 1;
+        // It looks like there may be some upcoming weirdness (TPCs with only one SM?)
+        // with Hopper
+        if (major >= 9)
+                fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
+        *num_tpcs = num_sms/sms_per_tpc;
+        return 0;
+}
+/**
+ * Read a hexadecimal integer from a file in `/proc`.
+ *
+ * At most 18 bytes are read and parsed as base 16.
+ *
+ * @param filename Path of the file to open read-only
+ * @param out      Receives the parsed value; left untouched on failure
+ * @return 0 on success, -errno if the file cannot be opened or read,
+ *         -EINVAL if the contents do not start with a hexadecimal number
+ */
+static int read_int_procfile(char* filename, uint64_t* out) {
+        // One byte more than is read, so the text is always NUL-terminated
+        char f_data[18 + 1] = {0};
+        char* end;
+        ssize_t len;
+        int read_errno;
+        unsigned long long val;
+        int fd = open(filename, O_RDONLY);
+        if (fd == -1)
+                return -errno;
+        len = read(fd, f_data, sizeof(f_data) - 1);
+        // Save errno before close() has a chance to overwrite it
+        read_errno = errno;
+        close(fd);
+        if (len < 0)
+                return -read_errno;
+        f_data[len] = '\0';
+        // Unsigned parse: the destination is a uint64_t
+        val = strtoull(f_data, &end, 16);
+        if (end == f_data)
+                return -EINVAL;
+        *out = val;
+        return 0;
+}
+// Preallocated result storage: [device number][index of enabled GPC]
+static uint64_t tpc_mask_per_gpc_per_dev[16][12];
+/**
+ * Get the number of enabled GPCs on device number `dev`, and an array with
+ * one mask per enabled GPC of the TPCs associated with it.
+ *
+ * All inputs are read from `/proc/gpu<dev>/...` files (the error message below
+ * states that these come from the nvdebug kernel module). In those files a set
+ * bit means "disabled". In the returned masks a set bit means the TPC belongs
+ * to that GPC; enabled TPCs are numbered consecutively across GPCs.
+ *
+ * @param num_enabled_gpcs Receives the number of enabled GPCs (reset to 0 on
+ *                         entry; may be partially counted on failure)
+ * @param tpcs_for_gpc     Receives a pointer into static storage; do not
+ *                         free it. Written only on success.
+ * @param dev              Device number; must be in the range 0-15
+ * @return 0 on success, -ERANGE if the GPU does not fit the preallocated map
+ *         or a 64-bit mask, or the error from reading a `/proc` file
+ */
+int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
+        const uint64_t map_devs = sizeof(tpc_mask_per_gpc_per_dev) / sizeof(tpc_mask_per_gpc_per_dev[0]);
+        const uint64_t map_gpcs = sizeof(tpc_mask_per_gpc_per_dev[0]) / sizeof(tpc_mask_per_gpc_per_dev[0][0]);
+        uint32_t i, j, vtpc_idx = 0;
+        uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask;
+        int err;
+        char filename[100];
+        *num_enabled_gpcs = 0;
+        // Maximum number of GPCs supported for this chip
+        snprintf(filename, sizeof(filename), "/proc/gpu%d/num_gpcs", dev);
+        err = read_int_procfile(filename, &max_gpcs);
+        if (err) {
+                fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");
+                return err;
+        }
+        // TODO: handle arbitrary-size GPUs
+        // `dev` indexes the map directly, so it must be a valid row index
+        if (dev < 0 || (uint64_t)dev >= map_devs || max_gpcs > map_gpcs) {
+                fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
+                return -ERANGE;
+        }
+        // Set bit = disabled GPC
+        snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc_mask", dev);
+        err = read_int_procfile(filename, &gpc_mask);
+        if (err)
+                return err;
+        snprintf(filename, sizeof(filename), "/proc/gpu%d/num_tpc_per_gpc", dev);
+        err = read_int_procfile(filename, &num_tpc_per_gpc);
+        if (err)
+                return err;
+        // Per-GPC TPC bits are tested in a 64-bit word below
+        if (num_tpc_per_gpc > 64) {
+                fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
+                return -ERANGE;
+        }
+        // For each enabled GPC
+        for (i = 0; i < max_gpcs; i++) {
+                // Skip this GPC if disabled
+                if ((1ull << i) & gpc_mask)
+                        continue;
+                (*num_enabled_gpcs)++;
+                // Get the bitstring of TPCs disabled for this GPC
+                // Set bit = disabled TPC
+                snprintf(filename, sizeof(filename), "/proc/gpu%d/gpc%u_tpc_mask", dev, i);
+                err = read_int_procfile(filename, &gpc_tpc_mask);
+                if (err)
+                        return err;
+                uint64_t* tpc_mask = &tpc_mask_per_gpc_per_dev[dev][*num_enabled_gpcs - 1];
+                *tpc_mask = 0;
+                for (j = 0; j < num_tpc_per_gpc; j++) {
+                        // Skip disabled TPCs
+                        if ((1ull << j) & gpc_tpc_mask)
+                                continue;
+                        // A 64-bit mask cannot describe a 65th enabled TPC
+                        if (vtpc_idx >= 64) {
+                                fprintf(stderr, "libsmctrl: More than 64 enabled TPCs; cannot represent them in a 64-bit mask!\n");
+                                return -ERANGE;
+                        }
+                        // 64-bit shift: an `int` shift breaks from bit 31 on
+                        *tpc_mask |= (1ull << vtpc_idx);
+                        vtpc_idx++;
+                }
+        }
+        *tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
+        return 0;
+}
diff --git a/libsmctrl.h b/libsmctrl.h
new file mode 100644
index 0000000..7be425d
--- /dev/null
+++ b/libsmctrl.h
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2022 Joshua Bakita
+ * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
+ * logic in the CUDA driver library, and thus requires a build with -lcuda.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* PARTITIONING FUNCTIONS */
+// Set global default TPC mask for all kernels, incl. CUDA-internal ones
+// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
+// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8
+extern void libsmctrl_set_global_mask(uint64_t mask);
+// Set default TPC mask for all kernels launched via `stream`
+// (overrides global mask)
+// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
+// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
+// Supported: CUDA 8.0 - CUDA 11.8
+extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
+// Set TPC mask for the next kernel launch from the caller's CPU thread
+// (overrides global and per-stream masks, applies only to next launch).
+// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
+// Supported: CUDA 11.0 - CUDA 11.8
+extern void libsmctrl_set_next_mask(uint64_t mask);
+// **DEPRECATED**: Old name for libsmctrl_set_global_mask()
+extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()")));
+/**
+ * Notes on Bitmasks
+ *
+ * All of the core partitioning functions take a `uint64_t mask` parameter. A
+ * set bit in the mask indicates that the respective Thread Processing Cluster
+ * (TPC) is to be __disabled__.
+ *
+ * Examples
+ * To prohibit the next kernel from using TPC 0:
+ *     libsmctrl_set_next_mask(0x1);
+ * Allow kernels to only use TPC 0 by default:
+ *     libsmctrl_set_global_mask(~0x1ull);
+ * Allow kernels in a stream to only use TPCs 2, 3, and 4:
+ *     libsmctrl_set_stream_mask(stream, ~0b00111100ull);
+ *
+ * Note that the bitwise inversion operator (~, as used above) is very useful,
+ * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull)
+ */
+/* INFORMATIONAL FUNCTIONS */
+// Get total number of TPCs on device number `dev`.
+extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
+// Get number of GPCs for device number `dev`, and a GPC-indexed array
+// containing masks of which TPCs are associated with each GPC.
+// Note that the `nvdebug` module must be loaded to use this function.
+extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
+#ifdef __cplusplus
+}
+#endif
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c
new file mode 100644
index 0000000..93bfc1e
--- /dev/null
+++ b/libsmctrl_test_gpc_info.c
@@ -0,0 +1,14 @@
+#include <stdio.h>
+#include <stdint.h>
+#include "libsmctrl.h"
+// Print the number of enabled GPCs and the TPC mask of each one.
+// The device number is hard-coded to 1. Exits with status 1 if the query
+// fails, since `num_gpcs` and `masks` are not valid in that case.
+int main() {
+        uint32_t num_gpcs;
+        uint64_t* masks;
+        int err = libsmctrl_get_gpc_info(&num_gpcs, &masks, 1);
+        if (err) {
+                fprintf(stderr, "libsmctrl_get_gpc_info() failed with error %d\n", err);
+                return 1;
+        }
+        printf("Num GPCs: %u\n", num_gpcs);
+        for (uint32_t i = 0; i < num_gpcs; i++) {
+                // Cast so the format specifier matches on every platform
+                printf("Mask of TPCs associated with GPC %u: %#llx\n", i, (unsigned long long)masks[i]);
+        }
+        return 0;
+}
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-02 22:14:22 -0500
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-03-02 22:14:22 -0500
commit	7db0d3088a6e25c7c64999a20267f55751571dee (patch)
tree	9867d0ede3818ade3a63f942446b40d2f1446254

diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dcff266 --- /dev/null +++ b/.gitignore
@@ -0,0 +1,4 @@
	1	libsmctrl.a
	2	libsmctrl.o
	3	libsmctrl.so
	4	libsmctrl_test_gpc_info


diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..aa59792 --- /dev/null +++ b/Makefile
@@ -0,0 +1,21 @@
	1	CC = gcc
	2	# -fPIC is needed in all cases, as we may be linked into another shared library
	3	CFLAGS = -fPIC
	4	LDFLAGS = -lcuda -I/usr/local/cuda/include
	5
	6	.PHONY: clean tests
	7
	8	libsmctrl.so: libsmctrl.c libsmctrl.h
	9	$(CC) $< -shared -o $@ $(CFLAGS) $(LDFLAGS)
	10
	11	libsmctrl.a: libsmctrl.c libsmctrl.h
	12	$(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
	13	ar rcs $@ libsmctrl.o
	14
	15	libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c
	16	$(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS)
	17
	18	tests: libsmctrl_test_gpc_info
	19
	20	clean:
	21	rm -f libsmctrl.so libsmctrl.a


diff --git a/libsmctrl.c b/libsmctrl.c new file mode 100644 index 0000000..69b19a1 --- /dev/null +++ b/libsmctrl.c
@@ -0,0 +1,332 @@
	1	/**
	2	* Copyright 2022 Joshua Bakita
	3	* Library to control SM masks on CUDA launches. Co-opts preexisting debug
	4	* logic in the CUDA driver library, and thus requires a build with -lcuda.
	5	*/
	6
	7	//#include "/playpen/playpen/cuda-11.8/include/cuda.h"
	8	#include <cuda.h>
	9	//#include <cuda_runtime.h>
	10	//#ifndef CUDA_VERSION
	11	//#warning libsmctrl: CUDA driver library must be included before libsmctrl.h.
	12	//#endif
	13
	14	#include <stdint.h>
	15	#include <errno.h>
	16	#include <fcntl.h>
	17	#include <unistd.h>
	18	#include <stdio.h>
	19
	20	// Layout of mask control fields in CUDA's `globals` struct
	21	struct global_sm_control {
	22	uint32_t enabled;
	23	uint64_t mask;
	24	} __attribute__((packed));
	25
	26	/* CUDA Globals Manipulation. CUDA 10.2 only */
	27
	28	// Ends up being 0x7fb7fa3408 in some binaries
	29	static struct global_sm_control* g_sm_control = NULL;
	30
	31	/* Find the location of CUDA's `globals` struct and the SM mask control fields
	32	* No symbols are exported from within `globals`, so this has to do a very
	33	* messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
	34	* Don't call this before the cuda library has been initialized.
	35	*/
	36	static void setup_sm_control_10() {
	37	if (g_sm_control)
	38	return;
	39	// Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by
	40	// the loader, but not subject to ASLR (it's always at a constant
	41	// offset in the loaded instance of libcuda.so). Our target is also at
	42	// a constant offset, so we can use the address of
	43	// cudbgReportDriverApiErrorFlags as a reference point.
	44	// Note: cudbgReportDriverApiErrorFlags is currently the closest known
	45	// symbol to the table. cudbgDebuggerInitialized is the closest to
	46	// globals itself (+7424 == SM mask control), but we prefer the table
	47	// lookup approach for now, as that's what cuDeviceGetCount() does.
	48	extern uint32_t cudbgReportDriverApiErrorFlags;
	49	uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;
	50	// In some binaries, the following works out to 0x7fb7ea6000, and
	51	// that's what shows up in the adrp instruction in cuDeviceGetCount()
	52	// in the lead-up to get globals.numDevices. Find this offset by
	53	// calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,
	54	// disassembling the prior instructions, taking the adrp constant, and
	55	// subtracting the address of cudbgReportDriverApiErrorFlags from it.
	56	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
	57	// Address of `globals` is at offset 3672 (entry 459?)
	58	uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64
	59	// SM mask control is at offset 4888 in the `globals` struct
	60	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
	61	// SM mask should be empty by default
	62	if (g_sm_control->enabled || g_sm_control->mask)
	63	fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");
	64	}
	65
	66	/* QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ */
	67
	68	// Tested working on CUDA x86_64 11.0-11.8.
	69	// Tested not working on aarch64 or x86_64 10.2
	70	static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
	71	#define LAUNCH_DOMAIN 0x3
	72	#define LAUNCH_PRE_UPLOAD 0x3
	73	static uint64_t g_sm_mask = 0;
	74	static __thread uint64_t g_next_sm_mask = 0;
	75	static char sm_control_setup_called = 0;
	76	static void launchCallback(void *ukwn, int domain, int cbid, const void *in_params) {
	77	if (*(uint32_t*)in_params < 0x50) {
	78	fprintf(stderr, "Unsupported CUDA version for callback-based SM masking. Aborting...\n");
	79	return;
	80	}
	81	if (!**((uintptr_t***)in_params+8)) {
	82	fprintf(stderr, "Called with NULL halLaunchDataAllocation\n");
	83	return;
	84	}
	85	//fprintf(stderr, "cta: %lx\n", *(uint64_t*)(**((char***)in_params + 8) + 74));
	86	// TODO: Check for supported QMD version (>XXX, <4.00)
	87	// TODO: Support QMD version 4 (Hopper), where offset starts at +304 (rather than +84) and is 72 bytes (rather than 8 bytes) wide
	88	uint32_t *lower_ptr = (uint32_t*)(**((char***)in_params + 8) + 84);
	89	uint32_t *upper_ptr = (uint32_t*)(**((char***)in_params + 8) + 88);
	90	if (g_next_sm_mask) {
	91	*lower_ptr = (uint32_t)g_next_sm_mask;
	92	*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
	93	g_next_sm_mask = 0;
	94	} else if (!*lower_ptr && !*upper_ptr){
	95	// Only apply the global mask if a per-stream mask hasn't been set
	96	*lower_ptr = (uint32_t)g_sm_mask;
	97	*upper_ptr = (uint32_t)(g_sm_mask >> 32);
	98	}
	99	//fprintf(stderr, "lower mask: %x\n", *lower_ptr);
	100	//fprintf(stderr, "upper mask: %x\n", *upper_ptr);
	101	}
	102
	103	static void setup_sm_control_11() {
	104	int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
	105	int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
	106	uintptr_t* tbl_base;
	107	uint32_t my_hndl;
	108	// Avoid race conditions (setup can only be called once)
	109	if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
	110	return;
	111
	112	cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
	113	uintptr_t subscribe_func_addr = *(tbl_base + 3);
	114	uintptr_t enable_func_addr = *(tbl_base + 6);
	115	subscribe = (typeof(subscribe))subscribe_func_addr;
	116	enable = (typeof(enable))enable_func_addr;
	117	int res = 0;
	118	res = subscribe(&my_hndl, launchCallback, NULL);
	119	if (res) {
	120	fprintf(stderr, "libsmctrl: Error subscribing to launch callback. Error %d\n", res);
	121	return;
	122	}
	123	res = enable(1, my_hndl, LAUNCH_DOMAIN, LAUNCH_PRE_UPLOAD);
	124	if (res)
	125	fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
	126	}
	127
	128	// Common masking control
	129	void libsmctrl_set_global_mask(uint64_t mask) {
	130	int ver;
	131	cuDriverGetVersion(&ver);
	132	if (ver <= 10020) {
	133	if (!g_sm_control)
	134	setup_sm_control_10();
	135	g_sm_control->mask = mask;
	136	g_sm_control->enabled = 1;
	137	} else {
	138	if (!sm_control_setup_called)
	139	setup_sm_control_11();
	140	g_sm_mask = mask;
	141	}
	142	}
	143
	144	void set_sm_mask(uint64_t mask) {
	145	libsmctrl_set_global_mask(mask);
	146	}
	147
	148	// Set mask for next launch from this thread
	149	void libsmctrl_set_next_mask(uint64_t mask) {
	150	if (!sm_control_setup_called)
	151	setup_sm_control_11();
	152	g_next_sm_mask = mask;
	153	}
	154
	155
	156	/* Per-Stream SM Mask (unlikely to be forward-compatible) */
	157
	158	#define CU_8_0_MASK_OFF 0xec
	159	#define CU_9_0_MASK_OFF 0x130
	160	// CUDA 9.0 and 9.1 use the same offset
	161	#define CU_9_2_MASK_OFF 0x140
	162	#define CU_10_0_MASK_OFF 0x24c
	163	// CUDA 10.0, 10.1 and 10.2 use the same offset
	164	#define CU_11_0_MASK_OFF 0x274
	165	#define CU_11_1_MASK_OFF 0x2c4
	166	#define CU_11_2_MASK_OFF 0x37c
	167	// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
	168	#define CU_11_6_MASK_OFF 0x38c
	169	#define CU_11_7_MASK_OFF 0x3c4
	170	#define CU_11_8_MASK_OFF 0x47c
	171	#define CU_12_0_MASK_OFF 0x4cc
	172	// CUDA 12.0 and 12.1 use the same offset
	173
	174	// Layout in CUDA's `stream` struct
	175	struct stream_sm_mask {
	176	uint32_t upper;
	177	uint32_t lower;
	178	} __attribute__((packed));
	179
	180	// Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1
	181	// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
	182	// our header
	183	void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
	184	char* stream_struct_base = *(char**)stream;
	185	struct stream_sm_mask* hw_mask;
	186	int ver;
	187	cuDriverGetVersion(&ver);
	188	switch (ver) {
	189	case 8000:
	190	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
	191	case 9000:
	192	case 9010:
	193	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
	194	break;
	195	case 9020:
	196	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF);
	197	break;
	198	case 10000:
	199	case 10010:
	200	case 10020:
	201	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_0_MASK_OFF);
	202	break;
	203	case 11000:
	204	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_0_MASK_OFF);
	205	break;
	206	case 11010:
	207	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_1_MASK_OFF);
	208	break;
	209	case 11020:
	210	case 11030:
	211	case 11040:
	212	case 11050:
	213	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_2_MASK_OFF);
	214	break;
	215	case 11060:
	216	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_6_MASK_OFF);
	217	break;
	218	case 11070:
	219	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_7_MASK_OFF);
	220	break;
	221	case 11080:
	222	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF);
	223	break;
	224	case 12000:
	225	case 12010:
	226	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_12_0_MASK_OFF);
	227	break;
	228	default: {
	229	// For experimenting to determine the right mask offset, set MASK_OFF (positive and negative numbers supported)
	230	char* mask_off_str = getenv("MASK_OFF");
	231	fprintf(stderr, "libsmctrl: Stream masking unsupported on this CUDA version (%d)!\n", ver);
	232	if (mask_off_str) {
	233	int off = atoi(mask_off_str);
	234	fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 11.8 base %#x (total off: %#x)\n", off, CU_11_8_MASK_OFF, CU_11_8_MASK_OFF+off);
	235	hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF + off);
	236	} else {
	237	return;
	238	}}
	239	}
	240
	241	hw_mask->upper = mask >> 32;
	242	hw_mask->lower = mask;
	243	}
	244
	245	int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
	246	int num_sms;
	247	int major;
	248	int minor;
	249	// TODO: Use nvdebug instead of this hardcoded hack
	250	cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
	251	cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
	252	cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
	253	// SM masking only works on sm_35+
	254	if (major < 3 || (major == 3 && minor < 5))
	255	return -ENOTSUP;
	256	// Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
	257	// as the P100, which is uniquely sm_60
	258	int sms_per_tpc;
	259	if (major > 6 || (major == 6 && minor == 0))
	260	sms_per_tpc = 2;
	261	else
	262	sms_per_tpc = 1;
	263	// It looks like there may be some upcoming weirdness (TPCs with only one SM?)
	264	// with Hopper
	265	if (major >= 9)
	266	fprintf(stderr, "libsmctrl: WARNING, SM masking is untested on Hopper, and will likely yield incorrect results! Proceed with caution.\n");
	267	*num_tpcs = num_sms/sms_per_tpc;
	268	return 0;
	269	}
	270
	271	// Read an integer from a file in `/proc`
	272	static int read_int_procfile(char* filename, uint64_t* out) {
	273	char f_data[18] = {0};
	274	int fd = open(filename, O_RDONLY);
	275	if (fd == -1)
	276	return -errno;
	277	read(fd, f_data, 18);
	278	close(fd);
	279	*out = strtoll(f_data, NULL, 16);
	280	return 0;
	281	}
	282
	283	static uint64_t tpc_mask_per_gpc_per_dev[16][12];
	284	int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
	285	uint32_t i, j, vtpc_idx = 0;
	286	uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask;
	287	int err;
	288	char filename[100];
	289	*num_enabled_gpcs = 0;
	290	// Maximum number of GPCs supported for this chip
	291	snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
	292	if (err = read_int_procfile(filename, &max_gpcs)) {
	293	fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before using libsmctrl_get_gpc_info()\n");
	294	return err;
	295	}
	296	// TODO: handle arbitrary-size GPUs
	297	if (dev > 16 || max_gpcs > 12) {
	298	fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
	299	return -ERANGE;
	300	}
	301	// Set bit = disabled GPC
	302	snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
	303	if (err = read_int_procfile(filename, &gpc_mask))
	304	return err;
	305	snprintf(filename, 100, "/proc/gpu%d/num_tpc_per_gpc", dev);
	306	if (err = read_int_procfile(filename, &num_tpc_per_gpc))
	307	return err;
	308	// For each enabled GPC
	309	for (i = 0; i < max_gpcs; i++) {
	310	// Skip this GPC if disabled
	311	if ((1 << i) & gpc_mask)
	312	continue;
	313	(*num_enabled_gpcs)++;
	314	// Get the bitstring of TPCs disabled for this GPC
	315	// Set bit = disabled TPC
	316	snprintf(filename, 100, "/proc/gpu%d/gpc%d_tpc_mask", dev, i);
	317	if (err = read_int_procfile(filename, &gpc_tpc_mask))
	318	return err;
	319	uint64_t* tpc_mask = &tpc_mask_per_gpc_per_dev[dev][*num_enabled_gpcs - 1];
	320	*tpc_mask = 0;
	321	for (j = 0; j < num_tpc_per_gpc; j++) {
	322	// Skip disabled TPCs
	323	if ((1 << j) & gpc_tpc_mask)
	324	continue;
	325	*tpc_mask |= (1 << vtpc_idx);
	326	vtpc_idx++;
	327	}
	328	}
	329	*tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
	330	return 0;
	331	}
	332


diff --git a/libsmctrl.h b/libsmctrl.h new file mode 100644 index 0000000..7be425d --- /dev/null +++ b/libsmctrl.h
@@ -0,0 +1,62 @@
	/**
	 * Copyright 2022 Joshua Bakita
	 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
	 * logic in the CUDA driver library, and thus requires a build with -lcuda.
	 */
	#ifndef LIBSMCTRL_H
	#define LIBSMCTRL_H

	// The declarations below use fixed-width integer types; include them here so
	// that this header can be included on its own.
	#include <stdint.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/* PARTITIONING FUNCTIONS */

	// Set global default TPC mask for all kernels, incl. CUDA-internal ones
	// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
	// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8
	extern void libsmctrl_set_global_mask(uint64_t mask);
	// Set default TPC mask for all kernels launched via `stream`
	// (overrides global mask)
	// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
	// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
	// Supported: CUDA 8.0 - CUDA 11.8
	extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
	// Set TPC mask for the next kernel launch from the caller's CPU thread
	// (overrides global and per-stream masks, applies only to next launch).
	// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
	// Supported: CUDA 11.0 - CUDA 11.8
	extern void libsmctrl_set_next_mask(uint64_t mask);

	// DEPRECATED: Old name for libsmctrl_set_global_mask()
	extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()")));

	/**
	 * Notes on Bitmasks
	 *
	 * All of the core partitioning functions take a `uint64_t mask` parameter. A
	 * set bit in the mask indicates that the respective Thread Processing Cluster
	 * (TPC) is to be __disabled__.
	 *
	 * Examples
	 * To prohibit the next kernel from using TPC 0:
	 *     libsmctrl_set_next_mask(0x1);
	 * Allow kernels to only use TPC 0 by default:
	 *     libsmctrl_set_global_mask(~0x1ull);
	 * Allow kernels in a stream to only use TPCs 2, 3, and 4:
	 *     libsmctrl_set_stream_mask(stream, ~0b00011100ull);
	 *
	 * Note that the bitwise inversion operator (~, as used above) is very useful,
	 * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull)
	 */

	/* INFORMATIONAL FUNCTIONS */

	// Get total number of TPCs on device number `dev`.
	extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
	// Get number of enabled GPCs for device number `dev`, and an array (indexed
	// by enabled GPC) containing masks of which TPCs are associated with each
	// GPC. In these masks, a set bit means the TPC belongs to that GPC.
	// The returned array is owned by the library; do not free it.
	// Returns 0 on success, and a non-zero error code otherwise.
	// Note that the `nvdebug` module must be loaded to use this function.
	extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);

	#ifdef __cplusplus
	}
	#endif

	#endif // LIBSMCTRL_H


diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c new file mode 100644 index 0000000..93bfc1e --- /dev/null +++ b/libsmctrl_test_gpc_info.c
@@ -0,0 +1,14 @@
	1	#include <stdio.h>
	2	#include <stdint.h>
	3	#include "libsmctrl.h"
	4
	// Smoke test for libsmctrl_get_gpc_info(): prints the number of enabled GPCs
	// on device number 1 and the TPC mask reported for each of them.
	// Exits non-zero if the query fails.
	int main(void) {
		uint32_t num_gpcs;
		uint64_t* masks;
		// `masks` is only valid when the call succeeds, so bail out on any error
		// rather than dereferencing an uninitialized pointer below.
		int err = libsmctrl_get_gpc_info(&num_gpcs, &masks, 1);
		if (err) {
			fprintf(stderr, "libsmctrl_get_gpc_info() failed with error %d\n", err);
			return 1;
		}
		// Casts keep the printf arguments matched to their conversion specifiers
		printf("Num GPCs: %u\n", (unsigned int)num_gpcs);
		for (uint32_t i = 0; i < num_gpcs; i++) {
			printf("Mask of TPCs associated with GPC %u: %#llx\n", (unsigned int)i, (unsigned long long)masks[i]);
		}
		return 0;
	}