aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Joshua Bakita <jbakita@cs.unc.edu> 2023-10-17 15:01:34 -0400
committer Joshua Bakita <jbakita@cs.unc.edu> 2023-10-17 15:01:34 -0400
commit 2c4b2d784815c5a2b4c49592b912c043f3d2a954 (patch)
tree 60dea314e590ad93bbd32ce7ccb75d2b0beae73f
parent 977f7eb86bb028f00b1b51c4f8c515087d37632b (diff)
Support global masking on x86_64 and aarch64 with CUDA 10.2
Also improve documentation and abort with an error message if attempting to set a global SM mask on an unsupported CUDA version. (Would crash/corrupt state before.) Also uncomment a line which errantly disabled global masking on CUDA 10.2 on aarch64. Tested with CUDA 10.2 on: - x86_64 (GTX 1060 3GB, driver 440.100, jbakita-old.cs.unc.edu) - aarch64 (Jetson TX2, driver r32.5, grizzly.cs.unc.edu)
-rw-r--r-- libsmctrl.c 122
1 files changed, 95 insertions, 27 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 94578a1..dfd71b8 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -6,12 +6,18 @@
6#include <cuda.h> 6#include <cuda.h>
7 7
8#include <errno.h> 8#include <errno.h>
9#include <error.h>
9#include <fcntl.h> 10#include <fcntl.h>
10#include <stdint.h> 11#include <stdint.h>
11#include <stdio.h> 12#include <stdio.h>
12#include <unistd.h> 13#include <unistd.h>
13 14
14// Layout of mask control fields in CUDA's `globals` struct 15// In functions that do not return an error code, we favor terminating with an
16// error rather than merely printing a warning and continuing.
17#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
18 __VA_ARGS__)
19
20// Layout of mask control fields to match CUDA's static global struct
15struct global_sm_control { 21struct global_sm_control {
16 uint32_t enabled; 22 uint32_t enabled;
17 uint64_t mask; 23 uint64_t mask;
@@ -19,42 +25,100 @@ struct global_sm_control {
19 25
20/*** CUDA Globals Manipulation. CUDA 10.2 only ***/ 26/*** CUDA Globals Manipulation. CUDA 10.2 only ***/
21 27
22// Ends up being 0x7fb7fa3408 in some binaries 28// Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson)
23static struct global_sm_control* g_sm_control = NULL; 29static struct global_sm_control* g_sm_control = NULL;
24 30
25/* Find the location of CUDA's `globals` struct and the SM mask control fields 31/* Find the location of CUDA's `globals` struct and the SM mask control fields
26 * No symbols are exported from within `globals`, so this has to do a very 32 * No symbols are exported from within `globals`, so this has to do a very
27 * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. 33 * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`.
28 * Don't call this before the cuda library has been initialized. 34 * Don't call this before the CUDA library has been initialized.
35 * (Note that this appears to work, even if built on CUDA > 10.2.)
29 */ 36 */
30static void setup_sm_control_10() { 37static void setup_g_sm_control_10() {
31 if (g_sm_control) 38 if (g_sm_control)
32 return; 39 return;
33 // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by 40 // The location of the static global struct containing the global SM
34 // the loader, but not subject to ASLR (it's always at a constant 41 // mask field will vary depending on where the loader locates the CUDA
35 // offset in the loaded instance of libcuda.so). Our target is also at 42 // library. In order to reliably modify this struct, we must defeat
36 // a constant offset, so we can use the address of 43 // that relocation by deriving its location relative to a known
37 // cudbgReportDriverApiErrorFlags as a reference point. 44 // reference point.
38 // Note: cudbgReportDriverApiErrorFlags is currently the closest known 45 //
39 // symbol to **the table**. cudbgDebuggerInitialized is the closest to 46 // == Choosing a Reference Point:
40 // globals itself (+7424 == SM mask control), but we perfer the table 47 // The cudbg* symbols appear to be relocated to a constant offset from
41 // lookup approach for now, as that's what cuDeviceGetCount() does. 48 // the globals structure, and so we use the address of the symbol
49 // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends
50 // up being the closest to an intermediate table we use as part of our
51 // lookup---process discussed below.)
42 extern uint32_t cudbgReportDriverApiErrorFlags; 52 extern uint32_t cudbgReportDriverApiErrorFlags;
43 uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags; 53 uint32_t* sym = &cudbgReportDriverApiErrorFlags;
44 // In some binaries, the following works out to 0x7fb7ea6000, and 54
45 // that's what shows up in the adrp instruction in cuDeviceGetCount() 55 // == Deriving Location:
46 // in the lead-up to get globals.numDevices. Find this offset by 56 // The number of CUDA devices available is co-located in the same CUDA
47 // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB, 57 // globals structure that we aim to modify the SM mask field in. The
48 // disassembling the prior instructions, taking the adrp constant, and 58 // value in that field can be assigned to a user-controlled pointer via
49 // subtracting the address of cudbgReportDriverApiErrorFlags from it. 59 // the cuDeviceGetCount() CUDA Driver Library function. To determine
60 // the location of this structure, we pass a bad address to the function
61 // and disassemble the code adjacent to where it segfaults. On the
62 // Jetson Xavier with CUDA 10.2, the assembly is as follows:
63 // (reg x19 contains cuDeviceGetCount()'s user-provided pointer)
64 // ...
65 // 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero
66 // 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl
67 // 0x0000007fb71454bc: ldr x1, [x1,#3672] // Get addr of globals
68 // 0x0000007fb71454c0: ldr w1, [x1,#904] // Get count from globals
69 // 0x0000007fb71454c4: str w1, [x19] // Store count at user addr
70 // ...
71 // In this assembly, we can identify that CUDA uses an internal lookup
72 // table to identify the location of the globals structure (pointer
73 // 459 in the table; offset 3672). After obtaining this pointer, it
74 // advances to offset 904 in the global structure, dereferences the
75 // value stored there, and then attempts to store it at the user-
76 // provided address (register x19). This final line will trigger a
77 // segfault if a non-zero bad address is passed to cuDeviceGetCount().
78 //
79 // On x86_64:
80 // (reg %rbx contains cuDeviceGetCount()'s user-provided pointer)
81 // ...
82 // 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero
83 // 0x00007ffff6cac022: je 0x7ffff6cac038 // ''
84 // 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer
85 // 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference
86 // 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr
87 // ...
88 // Note that this does not use an intermediate lookup table.
89 //
90 // [Aside: cudbgReportDriverApiErrorFlags is currently the closest
91 // symbol to **the lookup table**. cudbgDebuggerInitialized is closer
92 // to the globals struct itself (+7424 == SM mask control), but we
93 // prefer the table lookup approach for now, as that's what
94 // cuDeviceGetCount() does.]
95
96#if __aarch64__
97 // In my test binary, the lookup table is at address 0x7fb7ea6000, and
98 // this is 1029868 bytes before the address for
99 // cudbgReportDriverApiErrorFlags. Use this information to derive the
100 // location of the lookup in our binary (defeat relocation).
50 uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); 101 uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868);
51 // Address of `globals` is at offset 3672 (entry 459?) 102 // Address of `globals` is at offset 3672 (entry 459?) in the table
52 uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64 103 uintptr_t globals_addr = *(tbl_base + 459);
53 // SM mask control is at offset 4888 in the `globals` struct 104 // SM mask control is at offset 4888 in the `globals` struct
105 // [Device count at offset 904 (0x388)]
54 g_sm_control = (struct global_sm_control*)(globals_addr + 4888); 106 g_sm_control = (struct global_sm_control*)(globals_addr + 4888);
107#endif // __aarch64__
108#if __x86_64__
109 // In my test binary, globals is at 0x7ffff7cb0548, which is 1103576
110 // bytes before the address for cudbgReportDriverApiErrorFlags
111 // (0x7ffff7dbdc20). Use this offset to defeat relocation.
112 uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576);
113 // SM mask control is at offset 4728 in the `globals` struct
114 // [Device count at offset 776 (0x308)]
115 g_sm_control = (struct global_sm_control*)(globals_addr + 4728);
116#endif // __x86_64__
55 // SM mask should be empty by default 117 // SM mask should be empty by default
56 if (g_sm_control->enabled || g_sm_control->mask) 118 if (g_sm_control->enabled || g_sm_control->mask)
57 fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n"); 119 fprintf(stderr, "Warning: Found non-empty SM disable mask "
120 "during setup! libsmctrl_set_global_mask() is "
121 "unlikely to work on this platform!\n");
58} 122}
59 123
60/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ 124/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
@@ -119,19 +183,23 @@ static void setup_sm_control_11() {
119 fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); 183 fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res);
120} 184}
121 185
122// Common masking control 186// Set default mask for all launches
123void libsmctrl_set_global_mask(uint64_t mask) { 187void libsmctrl_set_global_mask(uint64_t mask) {
124 int ver; 188 int ver;
125 cuDriverGetVersion(&ver); 189 cuDriverGetVersion(&ver);
126 if (ver <= 10020) { 190 if (ver == 10020) {
127 if (!g_sm_control) 191 if (!g_sm_control)
128 setup_sm_control_10(); 192 setup_g_sm_control_10();
129 g_sm_control->mask = mask; 193 g_sm_control->mask = mask;
130 g_sm_control->enabled = 1; 194 g_sm_control->enabled = 1;
131 } else { 195 } else if (ver > 10020) {
132 if (!sm_control_setup_called) 196 if (!sm_control_setup_called)
133 setup_sm_control_11(); 197 setup_sm_control_11();
134 g_sm_mask = mask; 198 g_sm_mask = mask;
199 } else { // < CUDA 10.2
200 abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; "
201 "this application is using CUDA %d.%d",
202 ver / 1000, (ver % 100));
135 } 203 }
136} 204}
137 205