Support global masking on x86_64 and aarch64 with CUDA 10.2

Also improve documentation, and abort with an error message when attempting to set a global SM mask on an unsupported CUDA version. (Previously, this would crash or corrupt state.) Also uncomment a line whose commenting-out errantly disabled global masking on CUDA 10.2 on aarch64. Tested with CUDA 10.2 on x86_64 (GTX 1060 3GB, driver 440.100, jbakita-old.cs.unc.edu) and on aarch64 (Jetson TX2, driver r32.5, grizzly.cs.unc.edu).
author: Joshua Bakita <jbakita@cs.unc.edu> 2023-10-17 15:01:34 -0400
committer: Joshua Bakita <jbakita@cs.unc.edu> 2023-10-17 15:01:34 -0400
commit: 2c4b2d784815c5a2b4c49592b912c043f3d2a954 (patch)
tree: 60dea314e590ad93bbd32ce7ccb75d2b0beae73f
parent: 977f7eb86bb028f00b1b51c4f8c515087d37632b (diff)
1 files changed, 95 insertions, 27 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 94578a1..dfd71b8 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -6,12 +6,18 @@
 #include <cuda.h>
 #include <errno.h>
+#include <error.h>
 #include <fcntl.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <unistd.h>
-// Layout of mask control fields in CUDA's `globals` struct
+// In functions that do not return an error code, we favor terminating with an
+// error rather than merely printing a warning and continuing.
+// NOTE: this function-like macro takes over the name `abort` for the rest of
+// the file, so `abort(...)` here expands to error_at_line() (from <error.h>)
+// instead of calling the C standard library's abort(). `errno` in the
+// parameter list is only the macro's parameter name for the error number.
+// NOTE(review): error_at_line() is expected to terminate the process only
+// when `ret` is non-zero---confirm against the glibc manual before passing 0.
+#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
+                                             __VA_ARGS__)
+// Layout of mask control fields to match CUDA's static global struct
 struct global_sm_control {
        uint32_t enabled;
        uint64_t mask;
@@ -19,42 +25,100 @@ struct global_sm_control {
 /*** CUDA Globals Manipulation. CUDA 10.2 only ***/
-// Ends up being 0x7fb7fa3408 in some binaries
+// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson)
 static struct global_sm_control* g_sm_control = NULL;
 /* Find the location of CUDA's `globals` struct and the SM mask control fields
 * No symbols are exported from within `globals`, so this has to do a very
 * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
- * Don't call this before the cuda library has been initialized.
+ * Don't call this before the CUDA library has been initialized.
+ * (Note that this appears to work, even if built on CUDA > 10.2.)
+ * The result is cached in `g_sm_control`; once that is set, later calls
+ * return immediately.
 */
-static void setup_sm_control_10() {
+static void setup_g_sm_control_10() {
        if (g_sm_control)
                return;
-        // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by
+        // The location of the static global struct containing the global SM
-        // the loader, but not subject to ASLR (it's always at a constant
+        // mask field will vary depending on where the loader locates the CUDA
-        // offset in the loaded instance of libcuda.so). Our target is also at
+        // library. In order to reliably modify this struct, we must defeat
-        // a constant offset, so we can use the address of
+        // that relocation by deriving its location relative to a known
-        // cudbgReportDriverApiErrorFlags as a reference point.
+        // reference point.
-        // Note: cudbgReportDriverApiErrorFlags is currently the closest known
+        //
-        // symbol to **the table**. cudbgDebuggerInitialized is the closest to
+        // == Choosing a Reference Point:
-        // globals itself (+7424 == SM mask control), but we perfer the table
+        // The cudbg* symbols appear to be relocated to a constant offset from
-        // lookup approach for now, as that's what cuDeviceGetCount() does.
+        // the globals structure, and so we use the address of the symbol
+        // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends
+        // up being the closest symbol to an intermediate table we use as part
+        // of our lookup process---discussed below.)
        extern uint32_t cudbgReportDriverApiErrorFlags;
-        uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;
+        uint32_t* sym = &cudbgReportDriverApiErrorFlags;
-        // In some binaries, the following works out to 0x7fb7ea6000, and
-        // that's what shows up in the adrp instruction in cuDeviceGetCount()
+        // == Deriving Location:
-        // in the lead-up to get globals.numDevices. Find this offset by
+        // The number of CUDA devices available is co-located in the same CUDA
-        // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,
+        // globals structure that we aim to modify the SM mask field in. The
-        // disassembling the prior instructions, taking the adrp constant, and
+        // value in that field can be assigned to a user-controlled pointer via
-        // subtracting the address of cudbgReportDriverApiErrorFlags from it.
+        // the cuDeviceGetCount() CUDA Driver Library function. To determine
+        // the location of this structure, we pass a bad address to the function
+        // and disassemble the code adjacent to where it segfaults. On the
+        // Jetson Xavier with CUDA 10.2, the assembly is as follows:
+        //   (reg x19 contains cuDeviceGetCount()'s user-provided pointer)
+        //   ...
+        //   0x0000007fb71454b4:  cbz   x19, 0x7fb71454d0 // Check ptr non-zero
+        //   0x0000007fb71454b8:  adrp  x1, 0x7fb7ea6000 // Addr of lookup tbl
+        //   0x0000007fb71454bc:  ldr   x1, [x1,#3672] // Get addr of globals
+        //   0x0000007fb71454c0:  ldr   w1, [x1,#904] // Get count from globals
+        //   0x0000007fb71454c4:  str   w1, [x19] // Store count at user addr
+        //   ...
+        // In this assembly, we can identify that CUDA uses an internal lookup
+        // table to identify the location of the globals structure (pointer
+        // 459 in the table; offset 3672). After obtaining this pointer, it
+        // advances to offset 904 in the global structure, dereferences the
+        // value stored there, and then attempts to store it at the user-
+        // provided address (register x19). This final line will trigger a
+        // segfault if a non-zero bad address is passed to cuDeviceGetCount().
+        //
+        // On x86_64:
+        //   (reg %rbx contains cuDeviceGetCount()'s user-provided pointer)
+        //   ...
+        //   0x00007ffff6cac01f:  test  %rbx,%rbx // Check ptr non-zero
+        //   0x00007ffff6cac022:  je    0x7ffff6cac038 // ''
+        //   0x00007ffff6cac024:  mov   0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer
+        //   0x00007ffff6cac02b:  mov   0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference
+        //   0x00007ffff6cac031:  mov   %edx,(%rbx) // Store count at user addr
+        //   ...
+        // Note that this does not use an intermediate lookup table.
+        //
+        // [Aside: cudbgReportDriverApiErrorFlags is currently the closest
+        // symbol to **the lookup table**. cudbgDebuggerInitialized is closer
+        // to the globals struct itself (+7424 == SM mask control), but we
+        // prefer the table lookup approach for now, as that's what
+        // cuDeviceGetCount() does.]
+#if __aarch64__
+        // In my test binary, the lookup table is at address 0x7fb7ea6000, and
+        // this is 1029868 bytes before the address for
+        // cudbgReportDriverApiErrorFlags. Use this information to derive the
+        // location of the lookup table in our binary (defeat relocation).
        uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
-        // Address of `globals` is at offset 3672 (entry 459?)
+        // Address of `globals` is at offset 3672 (entry 459?) in the table
-        uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64
+        uintptr_t globals_addr = *(tbl_base + 459);
        // SM mask control is at offset 4888 in the `globals` struct
+        // [Device count at offset 904 (0x388)]
        g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
+#endif // __aarch64__
+#if __x86_64__
+        // In my test binary, the pointer to globals is stored at
+        // 0x7ffff7cb0548, which is 1103576 bytes before the address for
+        // cudbgReportDriverApiErrorFlags (0x7ffff7dbdc20). Use this offset to
+        // defeat relocation, then dereference to get the globals address.
+        uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576);
+        // SM mask control is at offset 4728 in the `globals` struct
+        // [Device count at offset 776 (0x308)]
+        g_sm_control = (struct global_sm_control*)(globals_addr + 4728);
+#endif // __x86_64__
+        // NOTE(review): on any other architecture neither branch above assigns
+        // g_sm_control, so the check below would dereference NULL.
        // SM mask should be empty by default
        if (g_sm_control->enabled || g_sm_control->mask)
-                fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");
+                fprintf(stderr, "Warning: Found non-empty SM disable mask "
+                                "during setup! libsmctrl_set_global_mask() is "
+                                "unlikely to work on this platform!\n");
 }
 /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
@@ -119,19 +183,23 @@ static void setup_sm_control_11() {
                fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
 }
-// Common masking control
+// Set default mask for all launches
+//
+// Dispatches on the driver version reported by cuDriverGetVersion(), which
+// encodes the version as (1000 * major + 10 * minor), e.g. 10020 == 10.2:
+//   == 10.2: write `mask` into CUDA's internal globals via g_sm_control and
+//            set its `enabled` field (locating the struct first if needed)
+//   >  10.2: run setup_sm_control_11() once if needed, then store `mask` in
+//            g_sm_mask
+//   <  10.2: unsupported; terminate with ENOSYS instead of touching state
 void libsmctrl_set_global_mask(uint64_t mask) {
        int ver;
        cuDriverGetVersion(&ver);
-        if (ver <= 10020) {
+        if (ver == 10020) {
                if (!g_sm_control)
-                        setup_sm_control_10();
+                        setup_g_sm_control_10();
                g_sm_control->mask = mask;
                g_sm_control->enabled = 1;
-        } else {
+        } else if (ver > 10020) {
                if (!sm_control_setup_called)
                        setup_sm_control_11();
                g_sm_mask = mask;
+        } else { // < CUDA 10.2
+                // Minor version is the tens digit group: (ver % 1000) / 10.
+                // (`ver % 100` would report CUDA 10.1 as "10.10".)
+                abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; "
+                                 "this application is using CUDA %d.%d",
+                                 ver / 1000, (ver % 1000) / 10);
        }
 }
author	Joshua Bakita <jbakita@cs.unc.edu>	2023-10-17 15:01:34 -0400
committer	Joshua Bakita <jbakita@cs.unc.edu>	2023-10-17 15:01:34 -0400
commit	2c4b2d784815c5a2b4c49592b912c043f3d2a954 (patch)
tree	60dea314e590ad93bbd32ce7ccb75d2b0beae73f
parent	977f7eb86bb028f00b1b51c4f8c515087d37632b (diff)

diff --git a/libsmctrl.c b/libsmctrl.c index 94578a1..dfd71b8 100644 --- a/libsmctrl.c +++ b/libsmctrl.c
@@ -6,12 +6,18 @@
6	#include <cuda.h>	6	#include <cuda.h>
7		7
8	#include <errno.h>	8	#include <errno.h>
		9	#include <error.h>
9	#include <fcntl.h>	10	#include <fcntl.h>
10	#include <stdint.h>	11	#include <stdint.h>
11	#include <stdio.h>	12	#include <stdio.h>
12	#include <unistd.h>	13	#include <unistd.h>
13		14
14	// Layout of mask control fields in CUDA's `globals` struct	15	// In functions that do not return an error code, we favor terminating with an
		16	// error rather than merely printing a warning and continuing.
		17	#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
		18	__VA_ARGS__)
		19
		20	// Layout of mask control fields to match CUDA's static global struct
15	struct global_sm_control {	21	struct global_sm_control {
16	uint32_t enabled;	22	uint32_t enabled;
17	uint64_t mask;	23	uint64_t mask;
@@ -19,42 +25,100 @@ struct global_sm_control {
19		25
20	/* CUDA Globals Manipulation. CUDA 10.2 only */	26	/* CUDA Globals Manipulation. CUDA 10.2 only */
21		27
22	// Ends up being 0x7fb7fa3408 in some binaries	28	// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson)
23	static struct global_sm_control* g_sm_control = NULL;	29	static struct global_sm_control* g_sm_control = NULL;
24		30
25	/* Find the location of CUDA's `globals` struct and the SM mask control fields	31	/* Find the location of CUDA's `globals` struct and the SM mask control fields
26	* No symbols are exported from within `globals`, so this has to do a very	32	* No symbols are exported from within `globals`, so this has to do a very
27	* messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.	33	* messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
28	* Don't call this before the cuda library has been initialized.	34	* Don't call this before the CUDA library has been initialized.
		35	* (Note that this appears to work, even if built on CUDA > 10.2.)
29	*/	36	*/
30	static void setup_sm_control_10() {	37	static void setup_g_sm_control_10() {
31	if (g_sm_control)	38	if (g_sm_control)
32	return;	39	return;
33	// Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by	40	// The location of the static global struct containing the global SM
34	// the loader, but not subject to ASLR (it's always at a constant	41	// mask field will vary depending on where the loader locates the CUDA
35	// offset in the loaded instance of libcuda.so). Our target is also at	42	// library. In order to reliably modify this struct, we must defeat
36	// a constant offset, so we can use the address of	43	// that relocation by deriving its location relative to a known
37	// cudbgReportDriverApiErrorFlags as a reference point.	44	// reference point.
38	// Note: cudbgReportDriverApiErrorFlags is currently the closest known	45	//
39	// symbol to the table. cudbgDebuggerInitialized is the closest to	46	// == Choosing a Reference Point:
40	// globals itself (+7424 == SM mask control), but we perfer the table	47	// The cudbg* symbols appear to be relocated to a constant offset from
41	// lookup approach for now, as that's what cuDeviceGetCount() does.	48	// the globals structure, and so we use the address of the symbol
		49	// `cudbgReportDriverApiErrorFlags` as our reference point. (This ends
		50	// up being the closest to an intermediate table we use as part of our
		51	// lookup---process discussed below.)
42	extern uint32_t cudbgReportDriverApiErrorFlags;	52	extern uint32_t cudbgReportDriverApiErrorFlags;
43	uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags;	53	uint32_t* sym = &cudbgReportDriverApiErrorFlags;
44	// In some binaries, the following works out to 0x7fb7ea6000, and	54
45	// that's what shows up in the adrp instruction in cuDeviceGetCount()	55	// == Deriving Location:
46	// in the lead-up to get globals.numDevices. Find this offset by	56	// The number of CUDA devices available is co-located in the same CUDA
47	// calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB,	57	// globals structure that we aim to modify the SM mask field in. The
48	// disassembling the prior instructions, taking the adrp constant, and	58	// value in that field can be assigned to a user-controlled pointer via
49	// subtracting the address of cudbgReportDriverApiErrorFlags from it.	59	// the cuDeviceGetCount() CUDA Driver Library function. To determine
		60	// the location of this structure, we pass a bad address to the function
		61	// and disassemble the code adjacent to where it segfaults. On the
		62	// Jetson Xavier with CUDA 10.2, the assembly is as follows:
		63	// (reg x19 contains cuDeviceGetCount()'s user-provided pointer)
		64	// ...
		65	// 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero
		66	// 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl
		67	// 0x0000007fb71454bc: ldr x1, [x1,#3672] // Get addr of globals
		68	// 0x0000007fb71454c0: ldr w1, [x1,#904] // Get count from globals
		69	// 0x0000007fb71454c4: str w1, [x19] // Store count at user addr
		70	// ...
		71	// In this assembly, we can identify that CUDA uses an internal lookup
		72	// table to identify the location of the globals structure (pointer
		73	// 459 in the table; offset 3672). After obtaining this pointer, it
		74	// advances to offset 904 in the global structure, dereferences the
		75	// value stored there, and then attempts to store it at the user-
		76	// -provided address (register x19). This final line will trigger a
		77	// segfault if a non-zero bad address is passed to cuDeviceGetCount().
		78	//
		79	// On x86_64:
		80	// (reg %rbx contains cuDeviceGetCount()'s user-provided pointer)
		81	// ...
		82	// 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero
		83	// 0x00007ffff6cac022: je 0x7ffff6cac038 // ''
		84	// 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer
		85	// 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference
		86	// 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr
		87	// ...
		88	// Note that this does not use an intermediate lookup table.
		89	//
		90	// [Aside: cudbgReportDriverApiErrorFlags is currently the closest
		91	// symbol to the lookup table. cudbgDebuggerInitialized is closer
		92	// to the globals struct itself (+7424 == SM mask control), but we
		93	// prefer the table lookup approach for now, as that's what
		94	// cuDeviceGetCount() does.]
		95
		96	#if __aarch64__
		97	// In my test binary, the lookup table is at address 0x7fb7ea6000, and
		98	// this is 1029868 bytes before the address for
		99	// cudbgReportDriverApiErrorFlags. Use this information to derive the
		100	// location of the lookup in our binary (defeat relocation).
50	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);	101	uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
51	// Address of `globals` is at offset 3672 (entry 459?)	102	// Address of `globals` is at offset 3672 (entry 459?) in the table
52	uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64	103	uintptr_t globals_addr = *(tbl_base + 459);
53	// SM mask control is at offset 4888 in the `globals` struct	104	// SM mask control is at offset 4888 in the `globals` struct
		105	// [Device count at offset 904 (0x388)]
54	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);	106	g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
		107	#endif // __aarch64__
		108	#if __x86_64__
		109	// In my test binary, globals is at 0x7ffff7cb0548, which is 1103576
		110	// bytes before the address for cudbgReportDriverApiErrorFlags
		111	// (0x7ffff7dbdc20). Use this offset to defeat relocation.
		112	uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576);
		113	// SM mask control is at offset 4728 in the `globals` struct
		114	// [Device count at offset 776 (0x308)]
		115	g_sm_control = (struct global_sm_control*)(globals_addr + 4728);
		116	#endif // __x86_64__
55	// SM mask should be empty by default	117	// SM mask should be empty by default
56	if (g_sm_control->enabled || g_sm_control->mask)	118	if (g_sm_control->enabled || g_sm_control->mask)
57	fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n");	119	fprintf(stderr, "Warning: Found non-empty SM disable mask "
		120	"during setup! libsmctrl_set_global_mask() is "
		121	"unlikely to work on this platform!\n");
58	}	122	}
59		123
60	/* QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ */	124	/* QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ */
@@ -119,19 +183,23 @@ static void setup_sm_control_11() {
119	fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);	183	fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
120	}	184	}
121		185
122	// Common masking control	186	// Set default mask for all launches
123	void libsmctrl_set_global_mask(uint64_t mask) {	187	void libsmctrl_set_global_mask(uint64_t mask) {
124	int ver;	188	int ver;
125	cuDriverGetVersion(&ver);	189	cuDriverGetVersion(&ver);
126	if (ver <= 10020) {	190	if (ver == 10020) {
127	if (!g_sm_control)	191	if (!g_sm_control)
128	setup_sm_control_10();	192	setup_g_sm_control_10();
129	g_sm_control->mask = mask;	193	g_sm_control->mask = mask;
130	g_sm_control->enabled = 1;	194	g_sm_control->enabled = 1;
131	} else {	195	} else if (ver > 10020) {
132	if (!sm_control_setup_called)	196	if (!sm_control_setup_called)
133	setup_sm_control_11();	197	setup_sm_control_11();
134	g_sm_mask = mask;	198	g_sm_mask = mask;
		199	} else { // < CUDA 10.2
		200	abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; "
		201	"this application is using CUDA %d.%d",
		202	ver / 1000, (ver % 100));
135	}	203	}
136	}	204	}
137		205