diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-21 14:05:34 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-21 14:05:34 -0500 |
commit | 5c65954998591bc61a4138024ba4895bed64a8a6 (patch) | |
tree | 08a62949fc91a298d0a7e00c6b87c8213f86fe1a | |
parent | 2accc2be54d3f9ad20d15f21bca6397ef6cabf92 (diff) |
Fix timestamping bugs and clean up CPU side of mon_cross_ctx_copies
- Add a more reliable heuristic for how long to run the CPU-side
copy monitor.
- Check that copy sizes are appropriately divisible.
- Fix parsing issue for copies larger than 8192 GiB.
- Clean up, and document bugs in, the time synchronization code.
- Wait for user before logging copies.
-rw-r--r-- | copy_experiments/mon_cross_ctx_copies.cu | 78 |
1 file changed, 57 insertions, 21 deletions
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu index e836179..1163b95 100644 --- a/copy_experiments/mon_cross_ctx_copies.cu +++ b/copy_experiments/mon_cross_ctx_copies.cu | |||
@@ -7,9 +7,20 @@ | |||
7 | * - GPU clock ticks at a constant rate | 7 | * - GPU clock ticks at a constant rate |
8 | * - GPU clock ticks even while GPU is idle | 8 | * - GPU clock ticks even while GPU is idle |
9 | * - Different contexts view the same underlying GPU clock | 9 | * - Different contexts view the same underlying GPU clock |
10 | 10 | * - GPU and CPU clocks tick at the same rate | |
11 | WARNING: By default, assumes that GPU and CPU clocks tick at the same rate. | 11 | * |
12 | * BUGS: | ||
13 | * - GPU and CPU clocks *do not* tick at the same rate | ||
14 | * - CPU clocks may be at different offsets and rates on different cores | ||
15 | * TODO: | ||
16 | * - Disable migrations while consistent timestamps are needed | ||
17 | * - When using GPU monitoring, synchronize clocks in each thread, and | ||
18 | * do this serially | ||
19 | * - Support configuring copy direction | ||
12 | * | 20 | * |
21 | * Note that only CPU-side monitoring is used in the RTAS'24 paper, so this | ||
22 | * tool is still correct and known-bug-free for the purposes of artifact | ||
23 | * evaluation. | ||
13 | */ | 24 | */ |
14 | #include "copy_testbench.h" | 25 | #include "copy_testbench.h" |
15 | #include "../task_host_utilities.cu" | 26 | #include "../task_host_utilities.cu" |
@@ -110,7 +121,6 @@ void* copy_thread(void* args_raw) { | |||
110 | 121 | ||
111 | uint64_t dev_ns, dev_ns2; | 122 | uint64_t dev_ns, dev_ns2; |
112 | double host_s, host_s2; | 123 | double host_s, host_s2; |
113 | //GetHostDeviceTimeOffset(dev, &host_s, &dev_ns); | ||
114 | if (GPU_COMPUTE && SKEW_CHECK) | 124 | if (GPU_COMPUTE && SKEW_CHECK) |
115 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); | 125 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); |
116 | 126 | ||
@@ -147,11 +157,18 @@ void* copy_thread(void* args_raw) { | |||
147 | for (int i = 0; i < GPU_MON_THREADS; i++) | 157 | for (int i = 0; i < GPU_MON_THREADS; i++) |
148 | ready &= barrier[i]; | 158 | ready &= barrier[i]; |
149 | } | 159 | } |
160 | } else { | ||
161 | if ((COPY_SIZE / PG_SZ) % CPU_MON_DIVISOR != 0) { | ||
162 | fprintf(stderr, "copy_size must be divisible by %d when using CPU-monitored copy threads.\n", CPU_MON_DIVISOR); | ||
163 | exit(1); | ||
164 | } | ||
150 | } | 165 | } |
151 | 166 | ||
152 | // Tell our parent we're ready | 167 | // Tell our parent we're ready |
153 | args->is_ready = 1; | 168 | args->is_ready = 1; |
154 | // Wait for our parent to tell us to go | 169 | // Wait for our parent to tell us to go (spinning here should also cause |
170 | // Linux's load-balancing logic to implictly move each monitoring thread to | ||
171 | // a separate core) | ||
155 | while (!READY) | 172 | while (!READY) |
156 | continue; | 173 | continue; |
157 | 174 | ||
@@ -176,8 +193,10 @@ void* copy_thread(void* args_raw) { | |||
176 | // Wait for copy monitor (and hence copy) to complete | 193 | // Wait for copy monitor (and hence copy) to complete |
177 | SAFE(cudaStreamSynchronize(stream2)); | 194 | SAFE(cudaStreamSynchronize(stream2)); |
178 | } else { | 195 | } else { |
179 | // Guess number of needed CPU-only monitoring cycles | 196 | // Guess number of needed CPU-only monitoring cycles (this heuristic |
180 | cpu_copy_mon(COPY_SIZE / PG_SZ, pinned_hostmem); | 197 | // ensures that monitoring runs for roughly the same amount of time, |
198 | // no matter the timestamping granularity). | ||
199 | cpu_copy_mon(CPU_MON_DIVISOR * COPY_SIZE / PG_SZ, pinned_hostmem); | ||
181 | // Make sure that the copy finished in case we guessed small | 200 | // Make sure that the copy finished in case we guessed small |
182 | SAFE(cudaStreamSynchronize(stream1)); | 201 | SAFE(cudaStreamSynchronize(stream1)); |
183 | } | 202 | } |
@@ -229,7 +248,7 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) { | |||
229 | case 's': | 248 | case 's': |
230 | if (atol(arg) < CPU_MON_DIVISOR * 2) | 249 | if (atol(arg) < CPU_MON_DIVISOR * 2) |
231 | argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2); | 250 | argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2); |
232 | COPY_SIZE = atol(arg) * PG_SZ; | 251 | COPY_SIZE = strtoull(arg, NULL, 0) * PG_SZ; |
233 | break; | 252 | break; |
234 | case 'g': | 253 | case 'g': |
235 | if (g_args->num_threads == MAX_THREADS) | 254 | if (g_args->num_threads == MAX_THREADS) |
@@ -254,6 +273,16 @@ int main(int argc, char**argv) { | |||
254 | pthread_t t[MAX_THREADS]; | 273 | pthread_t t[MAX_THREADS]; |
255 | global_args_t g_args = {0}; | 274 | global_args_t g_args = {0}; |
256 | 275 | ||
276 | struct argp argp = {opts, arg_parser, 0, desc}; | ||
277 | argp_parse(&argp, argc, argv, 0, 0, &g_args); | ||
278 | |||
279 | if (g_args.num_threads == 0) { | ||
280 | fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n"); | ||
281 | return 3; | ||
282 | } | ||
283 | |||
284 | fprintf(stderr, "(%d) Synchronizing clocks and initializing copy threads...\n", getpid()); | ||
285 | |||
257 | // Temporarially initialize CUDA to query device attributes | 286 | // Temporarially initialize CUDA to query device attributes |
258 | SAFE_D(cuInit(0)); | 287 | SAFE_D(cuInit(0)); |
259 | SAFE_D(cuDeviceGet(&dev_itrl, dev)); | 288 | SAFE_D(cuDeviceGet(&dev_itrl, dev)); |
@@ -261,7 +290,7 @@ int main(int argc, char**argv) { | |||
261 | /// XXX: Still seems to work fine if it isn't??? | 290 | /// XXX: Still seems to work fine if it isn't??? |
262 | SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); | 291 | SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); |
263 | if (!tmp) { | 292 | if (!tmp) { |
264 | fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" | 293 | fprintf(stderr, "Unsupported GPU. It must be possible to map host (CPU)" |
265 | " DRAM into the GPU virtual address space for accurate clock " | 294 | " DRAM into the GPU virtual address space for accurate clock " |
266 | "synchronization. Exiting...\n"); | 295 | "synchronization. Exiting...\n"); |
267 | return 1; | 296 | return 1; |
@@ -270,22 +299,25 @@ int main(int argc, char**argv) { | |||
270 | // reused in subprocesses | 299 | // reused in subprocesses |
271 | SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); | 300 | SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); |
272 | 301 | ||
273 | struct argp argp = {opts, arg_parser, 0, desc}; | 302 | double d2h_scale, host_s; |
274 | argp_parse(&argp, argc, argv, 0, 0, &g_args); | ||
275 | |||
276 | if (g_args.num_threads == 0) { | ||
277 | fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n"); | ||
278 | return 3; | ||
279 | } | ||
280 | |||
281 | // TODO: Rewrite this so that it's all (u)int64_t | ||
282 | // Skip, as this is 1-to-1 on GV100 and GV11B | ||
283 | //double d2h_scale = GetGPUTimerScale(dev); | ||
284 | // Get the offset of GPU time from CPU time | ||
285 | uint64_t dev_ns; | 303 | uint64_t dev_ns; |
286 | double host_s; | 304 | // Get the core-specific offset of GPU time from CPU time, and the |
305 | // core-specific difference in tick rates (typical variance of -13 to 60 | ||
306 | // microseconds per second). | ||
307 | // XXX: This is not sufficient for time synchronization, see "BUGS" at the | ||
308 | // top of this file. | ||
309 | // XXX: This creates an implict context, but should reuse the above. | ||
310 | d2h_scale = InternalGetGPUTimerScale(dev); | ||
287 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); | 311 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); |
312 | if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) { | ||
313 | fprintf(stderr, "Unabled to synchronize time with the GPU. Aborting...\n"); | ||
314 | return 1; | ||
315 | } | ||
288 | GPU_TIME_OFFSET = dev_ns - s2ns(host_s); | 316 | GPU_TIME_OFFSET = dev_ns - s2ns(host_s); |
317 | // Necessary to synchronize experiments running on different CPU cores | ||
318 | // (as CPU clocks are only semi-synchronized) | ||
319 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", getpid(), (long)s2ns(host_s) - dev_ns); | ||
320 | fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.9f\n", getpid(), d2h_scale); | ||
289 | 321 | ||
290 | // Copy buffers are filled with random numbers. Seed the RNG. | 322 | // Copy buffers are filled with random numbers. Seed the RNG. |
291 | srand(0); | 323 | srand(0); |
@@ -295,8 +327,12 @@ int main(int argc, char**argv) { | |||
295 | while (!g_args.thread_args[tid].is_ready) | 327 | while (!g_args.thread_args[tid].is_ready) |
296 | continue; | 328 | continue; |
297 | } | 329 | } |
330 | |||
331 | fprintf(stderr, "(%d) Initialization completed. Press enter to start copies...", getpid()); | ||
332 | getc(stdin); // Wait for user | ||
298 | // Tell children initialization is done and that they can go | 333 | // Tell children initialization is done and that they can go |
299 | READY = 1; | 334 | READY = 1; |
335 | |||
300 | // Wait for threads to finish, and determine the earliest recorded time | 336 | // Wait for threads to finish, and determine the earliest recorded time |
301 | uint64_t smallest = UINT64_MAX; | 337 | uint64_t smallest = UINT64_MAX; |
302 | for (int tid = 0; tid < g_args.num_threads; tid++) { | 338 | for (int tid = 0; tid < g_args.num_threads; tid++) { |