diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-17 15:01:34 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-17 15:01:34 -0400 |
commit | 2c4b2d784815c5a2b4c49592b912c043f3d2a954 (patch) | |
tree | 60dea314e590ad93bbd32ce7ccb75d2b0beae73f | |
parent | 977f7eb86bb028f00b1b51c4f8c515087d37632b (diff) |
Support global masking on x86_64 and aarch64 with CUDA 10.2
Also improve documentation and abort with an error message if
attempting to set a global SM mask on an unsupported CUDA version.
(Would crash/corrupt state before.)
Also uncomment a line which errantly disabled global masking on
CUDA 10.2 on aarch64.
Tested with CUDA 10.2 on:
- x86_64 (GTX 1060 3GB, driver 440.100, jbakita-old.cs.unc.edu)
- aarch64 (Jetson TX2, driver r32.5, grizzly.cs.unc.edu)
-rw-r--r-- | libsmctrl.c | 122 |
1 files changed, 95 insertions, 27 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 94578a1..dfd71b8 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -6,12 +6,18 @@ | |||
6 | #include <cuda.h> | 6 | #include <cuda.h> |
7 | 7 | ||
8 | #include <errno.h> | 8 | #include <errno.h> |
9 | #include <error.h> | ||
9 | #include <fcntl.h> | 10 | #include <fcntl.h> |
10 | #include <stdint.h> | 11 | #include <stdint.h> |
11 | #include <stdio.h> | 12 | #include <stdio.h> |
12 | #include <unistd.h> | 13 | #include <unistd.h> |
13 | 14 | ||
14 | // Layout of mask control fields in CUDA's `globals` struct | 15 | // In functions that do not return an error code, we favor terminating with an |
16 | // error rather than merely printing a warning and continuing. | ||
17 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ | ||
18 | __VA_ARGS__) | ||
19 | |||
20 | // Layout of mask control fields to match CUDA's static global struct | ||
15 | struct global_sm_control { | 21 | struct global_sm_control { |
16 | uint32_t enabled; | 22 | uint32_t enabled; |
17 | uint64_t mask; | 23 | uint64_t mask; |
@@ -19,42 +25,100 @@ struct global_sm_control { | |||
19 | 25 | ||
20 | /*** CUDA Globals Manipulation. CUDA 10.2 only ***/ | 26 | /*** CUDA Globals Manipulation. CUDA 10.2 only ***/ |
21 | 27 | ||
22 | // Ends up being 0x7fb7fa3408 in some binaries | 28 | // Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson) |
23 | static struct global_sm_control* g_sm_control = NULL; | 29 | static struct global_sm_control* g_sm_control = NULL; |
24 | 30 | ||
25 | /* Find the location of CUDA's `globals` struct and the SM mask control fields | 31 | /* Find the location of CUDA's `globals` struct and the SM mask control fields |
26 | * No symbols are exported from within `globals`, so this has to do a very | 32 | * No symbols are exported from within `globals`, so this has to do a very |
27 | * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. | 33 | * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. |
28 | * Don't call this before the cuda library has been initialized. | 34 | * Don't call this before the CUDA library has been initialized. |
35 | * (Note that this appears to work, even if built on CUDA > 10.2.) | ||
29 | */ | 36 | */ |
30 | static void setup_sm_control_10() { | 37 | static void setup_g_sm_control_10() { |
31 | if (g_sm_control) | 38 | if (g_sm_control) |
32 | return; | 39 | return; |
33 | // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by | 40 | // The location of the static global struct containing the global SM |
34 | // the loader, but not subject to ASLR (it's always at a constant | 41 | // mask field will vary depending on where the loader locates the CUDA |
35 | // offset in the loaded instance of libcuda.so). Our target is also at | 42 | // library. In order to reliably modify this struct, we must defeat |
36 | // a constant offset, so we can use the address of | 43 | // that relocation by deriving its location relative to a known |
37 | // cudbgReportDriverApiErrorFlags as a reference point. | 44 | // reference point. |
38 | // Note: cudbgReportDriverApiErrorFlags is currently the closest known | 45 | // |
39 | // symbol to **the table**. cudbgDebuggerInitialized is the closest to | 46 | // == Choosing a Reference Point: |
40 | // globals itself (+7424 == SM mask control), but we perfer the table | 47 | // The cudbg* symbols appear to be relocated to a constant offset from |
41 | // lookup approach for now, as that's what cuDeviceGetCount() does. | 48 | // the globals structure, and so we use the address of the symbol |
49 | // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends | ||
50 | // up being the closest to an intermediate table we use as part of our | ||
51 | // lookup---process discussed below.) | ||
42 | extern uint32_t cudbgReportDriverApiErrorFlags; | 52 | extern uint32_t cudbgReportDriverApiErrorFlags; |
43 | uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags; | 53 | uint32_t* sym = &cudbgReportDriverApiErrorFlags; |
44 | // In some binaries, the following works out to 0x7fb7ea6000, and | 54 | |
45 | // that's what shows up in the adrp instruction in cuDeviceGetCount() | 55 | // == Deriving Location: |
46 | // in the lead-up to get globals.numDevices. Find this offset by | 56 | // The number of CUDA devices available is co-located in the same CUDA |
47 | // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB, | 57 | // globals structure that we aim to modify the SM mask field in. The |
48 | // disassembling the prior instructions, taking the adrp constant, and | 58 | // value in that field can be assigned to a user-controlled pointer via |
49 | // subtracting the address of cudbgReportDriverApiErrorFlags from it. | 59 | // the cuDeviceGetCount() CUDA Driver Library function. To determine |
60 | // the location of this structure, we pass a bad address to the function | ||
61 | // and disassemble the code adjacent to where it segfaults. On the | ||
62 | // Jetson Xavier with CUDA 10.2, the assembly is as follows: | ||
63 | // (reg x19 contains cuDeviceGetCount()'s user-provided pointer) | ||
64 | // ... | ||
65 | // 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero | ||
66 | // 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl | ||
67 | // 0x0000007fb71454bc: ldr x1, [x1,#3672] // Get addr of globals | ||
68 | // 0x0000007fb71454c0: ldr w1, [x1,#904] // Get count from globals | ||
69 | // 0x0000007fb71454c4: str w1, [x19] // Store count at user addr | ||
70 | // ... | ||
71 | // In this assembly, we can identify that CUDA uses an internal lookup | ||
72 | // table to identify the location of the globals structure (pointer | ||
73 | // 459 in the table; offset 3672). After obtaining this pointer, it | ||
74 | // advances to offset 904 in the global structure, dereferences the | ||
75 | // value stored there, and then attempts to store it at the user- | ||
76 | // provided address (register x19). This final line will trigger a | ||
77 | // segfault if a non-zero bad address is passed to cuDeviceGetCount(). | ||
78 | // | ||
79 | // On x86_64: | ||
80 | // (reg %rbx contains cuDeviceGetCount()'s user-provided pointer) | ||
81 | // ... | ||
82 | // 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero | ||
83 | // 0x00007ffff6cac022: je 0x7ffff6cac038 // '' | ||
84 | // 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer | ||
85 | // 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference | ||
86 | // 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr | ||
87 | // ... | ||
88 | // Note that this does not use an intermediate lookup table. | ||
89 | // | ||
90 | // [Aside: cudbgReportDriverApiErrorFlags is currently the closest | ||
91 | // symbol to **the lookup table**. cudbgDebuggerInitialized is closer | ||
92 | // to the globals struct itself (+7424 == SM mask control), but we | ||
93 | // prefer the table lookup approach for now, as that's what | ||
94 | // cuDeviceGetCount() does.] | ||
95 | |||
96 | #if __aarch64__ | ||
97 | // In my test binary, the lookup table is at address 0x7fb7ea6000, and | ||
98 | // this is 1029868 bytes before the address for | ||
99 | // cudbgReportDriverApiErrorFlags. Use this information to derive the | ||
100 | // location of the lookup in our binary (defeat relocation). | ||
50 | uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); | 101 | uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); |
51 | // Address of `globals` is at offset 3672 (entry 459?) | 102 | // Address of `globals` is at offset 3672 (entry 459?) in the table |
52 | uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64 | 103 | uintptr_t globals_addr = *(tbl_base + 459); |
53 | // SM mask control is at offset 4888 in the `globals` struct | 104 | // SM mask control is at offset 4888 in the `globals` struct |
105 | // [Device count at offset 904 (0x388)] | ||
54 | g_sm_control = (struct global_sm_control*)(globals_addr + 4888); | 106 | g_sm_control = (struct global_sm_control*)(globals_addr + 4888); |
107 | #endif // __aarch64__ | ||
108 | #if __x86_64__ | ||
109 | // In my test binary, globals is at 0x7ffff7cb0548, which is 1103576 | ||
110 | // bytes before the address for cudbgReportDriverApiErrorFlags | ||
111 | // (0x7ffff7dbdc20). Use this offset to defeat relocation. | ||
112 | uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576); | ||
113 | // SM mask control is at offset 4728 in the `globals` struct | ||
114 | // [Device count at offset 776 (0x308)] | ||
115 | g_sm_control = (struct global_sm_control*)(globals_addr + 4728); | ||
116 | #endif // __x86_64__ | ||
55 | // SM mask should be empty by default | 117 | // SM mask should be empty by default |
56 | if (g_sm_control->enabled || g_sm_control->mask) | 118 | if (g_sm_control->enabled || g_sm_control->mask) |
57 | fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n"); | 119 | fprintf(stderr, "Warning: Found non-empty SM disable mask " |
120 | "during setup! libsmctrl_set_global_mask() is " | ||
121 | "unlikely to work on this platform!\n"); | ||
58 | } | 122 | } |
59 | 123 | ||
60 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ | 124 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ |
@@ -119,19 +183,23 @@ static void setup_sm_control_11() { | |||
119 | fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); | 183 | fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); |
120 | } | 184 | } |
121 | 185 | ||
122 | // Common masking control | 186 | // Set default mask for all launches |
123 | void libsmctrl_set_global_mask(uint64_t mask) { | 187 | void libsmctrl_set_global_mask(uint64_t mask) { |
124 | int ver; | 188 | int ver; |
125 | cuDriverGetVersion(&ver); | 189 | cuDriverGetVersion(&ver); |
126 | if (ver <= 10020) { | 190 | if (ver == 10020) { |
127 | if (!g_sm_control) | 191 | if (!g_sm_control) |
128 | setup_sm_control_10(); | 192 | setup_g_sm_control_10(); |
129 | g_sm_control->mask = mask; | 193 | g_sm_control->mask = mask; |
130 | g_sm_control->enabled = 1; | 194 | g_sm_control->enabled = 1; |
131 | } else { | 195 | } else if (ver > 10020) { |
132 | if (!sm_control_setup_called) | 196 | if (!sm_control_setup_called) |
133 | setup_sm_control_11(); | 197 | setup_sm_control_11(); |
134 | g_sm_mask = mask; | 198 | g_sm_mask = mask; |
199 | } else { // < CUDA 10.2 | ||
200 | abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; " | ||
201 | "this application is using CUDA %d.%d", | ||
202 | ver / 1000, (ver % 100)); | ||
135 | } | 203 | } |
136 | } | 204 | } |
137 | 205 | ||