diff options
| author | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-17 15:01:34 -0400 |
|---|---|---|
| committer | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-17 15:01:34 -0400 |
| commit | 2c4b2d784815c5a2b4c49592b912c043f3d2a954 (patch) | |
| tree | 60dea314e590ad93bbd32ce7ccb75d2b0beae73f | |
| parent | 977f7eb86bb028f00b1b51c4f8c515087d37632b (diff) | |
Support global masking on x86_64 and aarch64 with CUDA 10.2
Also improve documentation and abort with an error message if
attempting to set a global SM mask on an unsupported CUDA version.
(Would crash/corrupt state before.)
Also uncomment a line which errantly disabled global masking on
CUDA 10.2 on aarch64.
Tested with CUDA 10.2 on:
- x86_64 (GTX 1060 3GB, driver 440.100, jbakita-old.cs.unc.edu)
- aarch64 (Jetson TX2, driver r32.5, grizzly.cs.unc.edu)
| -rw-r--r-- | libsmctrl.c | 122 |
1 files changed, 95 insertions, 27 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 94578a1..dfd71b8 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -6,12 +6,18 @@ | |||
| 6 | #include <cuda.h> | 6 | #include <cuda.h> |
| 7 | 7 | ||
| 8 | #include <errno.h> | 8 | #include <errno.h> |
| 9 | #include <error.h> | ||
| 9 | #include <fcntl.h> | 10 | #include <fcntl.h> |
| 10 | #include <stdint.h> | 11 | #include <stdint.h> |
| 11 | #include <stdio.h> | 12 | #include <stdio.h> |
| 12 | #include <unistd.h> | 13 | #include <unistd.h> |
| 13 | 14 | ||
| 14 | // Layout of mask control fields in CUDA's `globals` struct | 15 | // In functions that do not return an error code, we favor terminating with an |
| 16 | // error rather than merely printing a warning and continuing. | ||
| 17 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ | ||
| 18 | __VA_ARGS__) | ||
| 19 | |||
| 20 | // Layout of mask control fields to match CUDA's static global struct | ||
| 15 | struct global_sm_control { | 21 | struct global_sm_control { |
| 16 | uint32_t enabled; | 22 | uint32_t enabled; |
| 17 | uint64_t mask; | 23 | uint64_t mask; |
| @@ -19,42 +25,100 @@ struct global_sm_control { | |||
| 19 | 25 | ||
| 20 | /*** CUDA Globals Manipulation. CUDA 10.2 only ***/ | 26 | /*** CUDA Globals Manipulation. CUDA 10.2 only ***/ |
| 21 | 27 | ||
| 22 | // Ends up being 0x7fb7fa3408 in some binaries | 28 | // Ends up being 0x7fb7fa3408 in some binaries (CUDA 10.2, Jetson) |
| 23 | static struct global_sm_control* g_sm_control = NULL; | 29 | static struct global_sm_control* g_sm_control = NULL; |
| 24 | 30 | ||
| 25 | /* Find the location of CUDA's `globals` struct and the SM mask control fields | 31 | /* Find the location of CUDA's `globals` struct and the SM mask control fields |
| 26 | * No symbols are exported from within `globals`, so this has to do a very | 32 | * No symbols are exported from within `globals`, so this has to do a very |
| 27 | * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. | 33 | * messy lookup, following the pattern of the assembly of `cuDeviceGetCount()`. |
| 28 | * Don't call this before the cuda library has been initialized. | 34 | * Don't call this before the CUDA library has been initialized. |
| 35 | * (Note that this appears to work, even if built on CUDA > 10.2.) | ||
| 29 | */ | 36 | */ |
| 30 | static void setup_sm_control_10() { | 37 | static void setup_g_sm_control_10() { |
| 31 | if (g_sm_control) | 38 | if (g_sm_control) |
| 32 | return; | 39 | return; |
| 33 | // Defeat relocation. cudbgReportDriverApiErrorFlags is relocated by | 40 | // The location of the static global struct containing the global SM |
| 34 | // the loader, but not subject to ASLR (it's always at a constant | 41 | // mask field will vary depending on where the loader locates the CUDA |
| 35 | // offset in the loaded instance of libcuda.so). Our target is also at | 42 | // library. In order to reliably modify this struct, we must defeat |
| 36 | // a constant offset, so we can use the address of | 43 | // that relocation by deriving its location relative to a known |
| 37 | // cudbgReportDriverApiErrorFlags as a reference point. | 44 | // reference point. |
| 38 | // Note: cudbgReportDriverApiErrorFlags is currently the closest known | 45 | // |
| 39 | // symbol to **the table**. cudbgDebuggerInitialized is the closest to | 46 | // == Choosing a Reference Point: |
| 40 | // globals itself (+7424 == SM mask control), but we perfer the table | 47 | // The cudbg* symbols appear to be relocated to a constant offset from |
| 41 | // lookup approach for now, as that's what cuDeviceGetCount() does. | 48 | // the globals structure, and so we use the address of the symbol |
| 49 | // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends | ||
| 50 | // up being the closest to an intermediate table we use as part of our | ||
| 51 | // lookup process---discussed below.) | ||
| 42 | extern uint32_t cudbgReportDriverApiErrorFlags; | 52 | extern uint32_t cudbgReportDriverApiErrorFlags; |
| 43 | uint32_t* sym = 0;//&cudbgReportDriverApiErrorFlags; | 53 | uint32_t* sym = &cudbgReportDriverApiErrorFlags; |
| 44 | // In some binaries, the following works out to 0x7fb7ea6000, and | 54 | |
| 45 | // that's what shows up in the adrp instruction in cuDeviceGetCount() | 55 | // == Deriving Location: |
| 46 | // in the lead-up to get globals.numDevices. Find this offset by | 56 | // The number of CUDA devices available is co-located in the same CUDA |
| 47 | // calling cuDeviceGetCount(0xdeadbeef), catching the segfault in GDB, | 57 | // globals structure that we aim to modify the SM mask field in. The |
| 48 | // disassembling the prior instructions, taking the adrp constant, and | 58 | // value in that field can be assigned to a user-controlled pointer via |
| 49 | // subtracting the address of cudbgReportDriverApiErrorFlags from it. | 59 | // the cuDeviceGetCount() CUDA Driver Library function. To determine |
| 60 | // the location of the structure, we pass a bad address to the function | ||
| 61 | // and disassemble the code adjacent to where it segfaults. On the | ||
| 62 | // Jetson Xavier with CUDA 10.2, the assembly is as follows: | ||
| 63 | // (reg x19 contains cuDeviceGetCount()'s user-provided pointer) | ||
| 64 | // ... | ||
| 65 | // 0x0000007fb71454b4: cbz x19, 0x7fb71454d0 // Check ptr non-zero | ||
| 66 | // 0x0000007fb71454b8: adrp x1, 0x7fb7ea6000 // Addr of lookup tbl | ||
| 67 | // 0x0000007fb71454bc: ldr x1, [x1,#3672] // Get addr of globals | ||
| 68 | // 0x0000007fb71454c0: ldr w1, [x1,#904] // Get count from globals | ||
| 69 | // 0x0000007fb71454c4: str w1, [x19] // Store count at user addr | ||
| 70 | // ... | ||
| 71 | // In this assembly, we can identify that CUDA uses an internal lookup | ||
| 72 | // table to identify the location of the globals structure (pointer | ||
| 73 | // 459 in the table; offset 3672). After obtaining this pointer, it | ||
| 74 | // advances to offset 904 in the global structure, dereferences the | ||
| 75 | // value stored there, and then attempts to store it at the user- | ||
| 76 | // provided address (register x19). This final line will trigger a | ||
| 77 | // segfault if a non-zero bad address is passed to cuDeviceGetCount(). | ||
| 78 | // | ||
| 79 | // On x86_64: | ||
| 80 | // (reg %rbx contains cuDeviceGetCount()'s user-provided pointer) | ||
| 81 | // ... | ||
| 82 | // 0x00007ffff6cac01f: test %rbx,%rbx // Check ptr non-zero | ||
| 83 | // 0x00007ffff6cac022: je 0x7ffff6cac038 // '' | ||
| 84 | // 0x00007ffff6cac024: mov 0x100451d(%rip),%rdx # 0x7ffff7cb0548 // Get globals base address from offset from instruction pointer | ||
| 85 | // 0x00007ffff6cac02b: mov 0x308(%rdx),%edx // Take globals base address, add an offset of 776, and dereference | ||
| 86 | // 0x00007ffff6cac031: mov %edx,(%rbx) // Store count at user addr | ||
| 87 | // ... | ||
| 88 | // Note that this does not use an intermediate lookup table. | ||
| 89 | // | ||
| 90 | // [Aside: cudbgReportDriverApiErrorFlags is currently the closest | ||
| 91 | // symbol to **the lookup table**. cudbgDebuggerInitialized is closer | ||
| 92 | // to the globals struct itself (+7424 == SM mask control), but we | ||
| 93 | // prefer the table lookup approach for now, as that's what | ||
| 94 | // cuDeviceGetCount() does.] | ||
| 95 | |||
| 96 | #if __aarch64__ | ||
| 97 | // In my test binary, the lookup table is at address 0x7fb7ea6000, and | ||
| 98 | // this is 1029868 bytes before the address for | ||
| 99 | // cudbgReportDriverApiErrorFlags. Use this information to derive the | ||
| 100 | // location of the lookup in our binary (defeat relocation). | ||
| 50 | uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); | 101 | uintptr_t* tbl_base = (uintptr_t*)((uintptr_t)sym - 1029868); |
| 51 | // Address of `globals` is at offset 3672 (entry 459?) | 102 | // Address of `globals` is at offset 3672 (entry 459?) in the table |
| 52 | uintptr_t globals_addr = *(tbl_base + 459); // Offset 3672 on aarch64 | 103 | uintptr_t globals_addr = *(tbl_base + 459); |
| 53 | // SM mask control is at offset 4888 in the `globals` struct | 104 | // SM mask control is at offset 4888 in the `globals` struct |
| 105 | // [Device count at offset 904 (0x388)] | ||
| 54 | g_sm_control = (struct global_sm_control*)(globals_addr + 4888); | 106 | g_sm_control = (struct global_sm_control*)(globals_addr + 4888); |
| 107 | #endif // __aarch64__ | ||
| 108 | #if __x86_64__ | ||
| 109 | // In my test binary, globals is at 0x7ffff7cb0548, which is 1103576 | ||
| 110 | // bytes before the address for cudbgReportDriverApiErrorFlags | ||
| 111 | // (0x7ffff7dbdc20). Use this offset to defeat relocation. | ||
| 112 | uintptr_t globals_addr = *(uintptr_t*)((uintptr_t)sym - 1103576); | ||
| 113 | // SM mask control is at offset 4728 in the `globals` struct | ||
| 114 | // [Device count at offset 776 (0x308)] | ||
| 115 | g_sm_control = (struct global_sm_control*)(globals_addr + 4728); | ||
| 116 | #endif // __x86_64__ | ||
| 55 | // SM mask should be empty by default | 117 | // SM mask should be empty by default |
| 56 | if (g_sm_control->enabled || g_sm_control->mask) | 118 | if (g_sm_control->enabled || g_sm_control->mask) |
| 57 | fprintf(stderr, "Warning: Found non-NULL SM disable mask during setup! g_sm_control is likely invalid---use at own risk.\n"); | 119 | fprintf(stderr, "Warning: Found non-empty SM disable mask " |
| 120 | "during setup! libsmctrl_set_global_mask() is " | ||
| 121 | "unlikely to work on this platform!\n"); | ||
| 58 | } | 122 | } |
| 59 | 123 | ||
| 60 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ | 124 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ |
| @@ -119,19 +183,23 @@ static void setup_sm_control_11() { | |||
| 119 | fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); | 183 | fprintf(stderr, "libsmctrl: Error enabling launch callback. Error %d\n", res); |
| 120 | } | 184 | } |
| 121 | 185 | ||
| 122 | // Common masking control | 186 | // Set default mask for all launches |
| 123 | void libsmctrl_set_global_mask(uint64_t mask) { | 187 | void libsmctrl_set_global_mask(uint64_t mask) { |
| 124 | int ver; | 188 | int ver; |
| 125 | cuDriverGetVersion(&ver); | 189 | cuDriverGetVersion(&ver); |
| 126 | if (ver <= 10020) { | 190 | if (ver == 10020) { |
| 127 | if (!g_sm_control) | 191 | if (!g_sm_control) |
| 128 | setup_sm_control_10(); | 192 | setup_g_sm_control_10(); |
| 129 | g_sm_control->mask = mask; | 193 | g_sm_control->mask = mask; |
| 130 | g_sm_control->enabled = 1; | 194 | g_sm_control->enabled = 1; |
| 131 | } else { | 195 | } else if (ver > 10020) { |
| 132 | if (!sm_control_setup_called) | 196 | if (!sm_control_setup_called) |
| 133 | setup_sm_control_11(); | 197 | setup_sm_control_11(); |
| 134 | g_sm_mask = mask; | 198 | g_sm_mask = mask; |
| 199 | } else { // < CUDA 10.2 | ||
| 200 | abort(1, ENOSYS, "Global masking requires at least CUDA 10.2; " | ||
| 201 | "this application is using CUDA %d.%d", | ||
| 202 | ver / 1000, (ver % 100)); | ||
| 135 | } | 203 | } |
| 136 | } | 204 | } |
| 137 | 205 | ||
