about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- libsmctrl.c 49
1 files changed, 26 insertions, 23 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 09fc627..5ee94fb 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -1,5 +1,5 @@
1/** 1/**
2 * Copyright 2022-2024 Joshua Bakita 2 * Copyright 2022-2025 Joshua Bakita
3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug 3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda. 4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 * 5 *
@@ -10,7 +10,7 @@
10 * +-----------+---------------+---------------+--------------+ 10 * +-----------+---------------+---------------+--------------+
11 * | Version | Global Mask | Stream Mask | Next Mask | 11 * | Version | Global Mask | Stream Mask | Next Mask |
12 * +-----------+---------------+---------------+--------------+ 12 * +-----------+---------------+---------------+--------------+
13 * | 8.0-12.6 | TMD/QMD Hook | stream struct | TMD/QMD Hook | 13 * | 8.0-12.8 | TMD/QMD Hook | stream struct | TMD/QMD Hook |
14 * | 6.5-7.5 | TMD/QMD Hook | N/A | TMD/QMD Hook | 14 * | 6.5-7.5 | TMD/QMD Hook | N/A | TMD/QMD Hook |
15 * +-----------+---------------+---------------+--------------+ 15 * +-----------+---------------+---------------+--------------+
16 * "N/A" indicates that a mask type is unsupported on that CUDA version. 16 * "N/A" indicates that a mask type is unsupported on that CUDA version.
@@ -119,10 +119,23 @@ static void setup_sm_control_callback() {
119 int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid); 119 int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
120 uintptr_t* tbl_base; 120 uintptr_t* tbl_base;
121 uint32_t my_hndl; 121 uint32_t my_hndl;
122 // Avoid race conditions (setup can only be called once) 122 // Avoid race conditions (setup should only run once)
123 if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST)) 123 if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
124 return; 124 return;
125 125
126#if CUDA_VERSION <= 6050
127 // Verify supported CUDA version
128 // It's impossible for us to run with a version of CUDA older than we were
129 // built by, so this check is excluded if built with CUDA > 6.5.
130 int ver = 0;
131 cuDriverGetVersion(&ver);
132 if (ver < 6050)
133 abort(1, ENOSYS, "Global or next masking requires at least CUDA 6.5; "
134 "this application is using CUDA %d.%d",
135 ver / 1000, (ver % 100));
136#endif
137
138 // Set up callback
126 cuGetExportTable((const void**)&tbl_base, &callback_funcs_id); 139 cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
127 uintptr_t subscribe_func_addr = *(tbl_base + 3); 140 uintptr_t subscribe_func_addr = *(tbl_base + 3);
128 uintptr_t enable_func_addr = *(tbl_base + 6); 141 uintptr_t enable_func_addr = *(tbl_base + 6);
@@ -139,31 +152,13 @@ static void setup_sm_control_callback() {
139 152
140// Set default mask for all launches 153// Set default mask for all launches
141void libsmctrl_set_global_mask(uint64_t mask) { 154void libsmctrl_set_global_mask(uint64_t mask) {
142 if (!sm_control_setup_called) { 155 setup_sm_control_callback();
143 // The version will not change while running, so only check once
144 int ver = 0;
145 cuDriverGetVersion(&ver);
146 if (ver < 6050)
147 abort(1, ENOSYS, "Global masking requires at least CUDA 6.5; "
148 "this application is using CUDA %d.%d",
149 ver / 1000, (ver % 100));
150 setup_sm_control_callback();
151 }
152 g_sm_mask = mask; 156 g_sm_mask = mask;
153} 157}
154 158
155// Set mask for next launch from this thread 159// Set mask for next launch from this thread
156void libsmctrl_set_next_mask(uint64_t mask) { 160void libsmctrl_set_next_mask(uint64_t mask) {
157 if (!sm_control_setup_called) { 161 setup_sm_control_callback();
158 // The version will not change while running, so only check once
159 int ver = 0;
160 cuDriverGetVersion(&ver);
161 if (ver < 6050)
162 abort(1, ENOSYS, "Next masking requires at least CUDA 6.5; "
163 "this application is using CUDA %d.%d",
164 ver / 1000, (ver % 100));
165 setup_sm_control_callback();
166 }
167 g_next_sm_mask = mask; 162 g_next_sm_mask = mask;
168} 163}
169 164
@@ -204,6 +199,10 @@ void libsmctrl_set_next_mask(uint64_t mask) {
204// CUDA 12.5 and 12.6 use the same offset 199// CUDA 12.5 and 12.6 use the same offset
205// 12.5 tested on 555.58.02 200// 12.5 tested on 555.58.02
206// 12.6 tested on 560.35.03 201// 12.6 tested on 560.35.03
202#define CU_12_7_MASK_OFF 0x4fc
203// CUDA 12.7 and 12.8 use the same offset
204// 12.7 tested on 565.77
205// 12.8 tested on 570.124.06
207 206
208// Offsets for the stream struct on Jetson aarch64 207// Offsets for the stream struct on Jetson aarch64
209#define CU_9_0_MASK_OFF_JETSON 0x128 208#define CU_9_0_MASK_OFF_JETSON 0x128
@@ -334,6 +333,10 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
334 case 12060: 333 case 12060:
335 hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF); 334 hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF);
336 break; 335 break;
336 case 12070:
337 case 12080:
338 hw_mask_v2 = (void*)(stream_struct_base + CU_12_7_MASK_OFF);
339 break;
337#elif __aarch64__ 340#elif __aarch64__
338 case 9000: { 341 case 9000: {
339 // Jetson TX2 offset is slightly different on CUDA 9.0. 342 // Jetson TX2 offset is slightly different on CUDA 9.0.