summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2024-02-21 14:05:34 -0500
committerJoshua Bakita <bakitajoshua@gmail.com>2024-02-21 14:05:34 -0500
commit5c65954998591bc61a4138024ba4895bed64a8a6 (patch)
tree08a62949fc91a298d0a7e00c6b87c8213f86fe1a
parent2accc2be54d3f9ad20d15f21bca6397ef6cabf92 (diff)
Fix timestamping bugs and clean up CPU side of mon_cross_ctx_copies
- Add a more reliable heuristic for how long to run the CPU-side copy monitor. - Check that copies sizes are appropriately divisible. - Fix parsing issue for copies larger than 8192 GiB. - Clean up, and document bugs in, the time synchronization code. - Wait for user before logging copies.
-rw-r--r--copy_experiments/mon_cross_ctx_copies.cu78
1 files changed, 57 insertions, 21 deletions
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu
index e836179..1163b95 100644
--- a/copy_experiments/mon_cross_ctx_copies.cu
+++ b/copy_experiments/mon_cross_ctx_copies.cu
@@ -7,9 +7,20 @@
7 * - GPU clock ticks at a constant rate 7 * - GPU clock ticks at a constant rate
8 * - GPU clock ticks even while GPU is idle 8 * - GPU clock ticks even while GPU is idle
9 * - Different contexts view the same underlying GPU clock 9 * - Different contexts view the same underlying GPU clock
10 10 * - GPU and CPU clocks tick at the same rate
11WARNING: By default, assumes that GPU and CPU clocks tick at the same rate. 11 *
12 * BUGS:
13 * - GPU and CPU clocks *do not* tick at the same rate
14 * - CPU clocks may be at different offsets and rates on different cores
15 * TODO:
16 * - Disable migrations while consistent timestamps are needed
17 * - When using GPU monitoring, synchronize clocks in each thread, and
18 * do this serially
19 * - Support configuring copy direction
12 * 20 *
21 * Note that only CPU-side monitoring is used in the RTAS'24 paper, so this
22 * tool is still correct and known-bug-free for the purposes of artifact
23 * evaluation.
13 */ 24 */
14#include "copy_testbench.h" 25#include "copy_testbench.h"
15#include "../task_host_utilities.cu" 26#include "../task_host_utilities.cu"
@@ -110,7 +121,6 @@ void* copy_thread(void* args_raw) {
110 121
111 uint64_t dev_ns, dev_ns2; 122 uint64_t dev_ns, dev_ns2;
112 double host_s, host_s2; 123 double host_s, host_s2;
113 //GetHostDeviceTimeOffset(dev, &host_s, &dev_ns);
114 if (GPU_COMPUTE && SKEW_CHECK) 124 if (GPU_COMPUTE && SKEW_CHECK)
115 InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); 125 InternalReadGPUNanoseconds(dev, &host_s, &dev_ns);
116 126
@@ -147,11 +157,18 @@ void* copy_thread(void* args_raw) {
147 for (int i = 0; i < GPU_MON_THREADS; i++) 157 for (int i = 0; i < GPU_MON_THREADS; i++)
148 ready &= barrier[i]; 158 ready &= barrier[i];
149 } 159 }
160 } else {
161 if ((COPY_SIZE / PG_SZ) % CPU_MON_DIVISOR != 0) {
162 fprintf(stderr, "copy_size must be divisible by %d when using CPU-monitored copy threads.\n", CPU_MON_DIVISOR);
163 exit(1);
164 }
150 } 165 }
151 166
152 // Tell our parent we're ready 167 // Tell our parent we're ready
153 args->is_ready = 1; 168 args->is_ready = 1;
154 // Wait for our parent to tell us to go 169 // Wait for our parent to tell us to go (spinning here should also cause
170 // Linux's load-balancing logic to implictly move each monitoring thread to
171 // a separate core)
155 while (!READY) 172 while (!READY)
156 continue; 173 continue;
157 174
@@ -176,8 +193,10 @@ void* copy_thread(void* args_raw) {
176 // Wait for copy monitor (and hence copy) to complete 193 // Wait for copy monitor (and hence copy) to complete
177 SAFE(cudaStreamSynchronize(stream2)); 194 SAFE(cudaStreamSynchronize(stream2));
178 } else { 195 } else {
179 // Guess number of needed CPU-only monitoring cycles 196 // Guess number of needed CPU-only monitoring cycles (this heuristic
180 cpu_copy_mon(COPY_SIZE / PG_SZ, pinned_hostmem); 197 // ensures that monitoring runs for roughly the same amount of time,
198 // no matter the timestamping granularity).
199 cpu_copy_mon(CPU_MON_DIVISOR * COPY_SIZE / PG_SZ, pinned_hostmem);
181 // Make sure that the copy finished in case we guessed small 200 // Make sure that the copy finished in case we guessed small
182 SAFE(cudaStreamSynchronize(stream1)); 201 SAFE(cudaStreamSynchronize(stream1));
183 } 202 }
@@ -229,7 +248,7 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) {
229 case 's': 248 case 's':
230 if (atol(arg) < CPU_MON_DIVISOR * 2) 249 if (atol(arg) < CPU_MON_DIVISOR * 2)
231 argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2); 250 argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2);
232 COPY_SIZE = atol(arg) * PG_SZ; 251 COPY_SIZE = strtoull(arg, NULL, 0) * PG_SZ;
233 break; 252 break;
234 case 'g': 253 case 'g':
235 if (g_args->num_threads == MAX_THREADS) 254 if (g_args->num_threads == MAX_THREADS)
@@ -254,6 +273,16 @@ int main(int argc, char**argv) {
254 pthread_t t[MAX_THREADS]; 273 pthread_t t[MAX_THREADS];
255 global_args_t g_args = {0}; 274 global_args_t g_args = {0};
256 275
276 struct argp argp = {opts, arg_parser, 0, desc};
277 argp_parse(&argp, argc, argv, 0, 0, &g_args);
278
279 if (g_args.num_threads == 0) {
280 fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n");
281 return 3;
282 }
283
284 fprintf(stderr, "(%d) Synchronizing clocks and initializing copy threads...\n", getpid());
285
257 // Temporarially initialize CUDA to query device attributes 286 // Temporarially initialize CUDA to query device attributes
258 SAFE_D(cuInit(0)); 287 SAFE_D(cuInit(0));
259 SAFE_D(cuDeviceGet(&dev_itrl, dev)); 288 SAFE_D(cuDeviceGet(&dev_itrl, dev));
@@ -261,7 +290,7 @@ int main(int argc, char**argv) {
261 /// XXX: Still seems to work fine if it isn't??? 290 /// XXX: Still seems to work fine if it isn't???
262 SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); 291 SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl));
263 if (!tmp) { 292 if (!tmp) {
264 fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" 293 fprintf(stderr, "Unsupported GPU. It must be possible to map host (CPU)"
265 " DRAM into the GPU virtual address space for accurate clock " 294 " DRAM into the GPU virtual address space for accurate clock "
266 "synchronization. Exiting...\n"); 295 "synchronization. Exiting...\n");
267 return 1; 296 return 1;
@@ -270,22 +299,25 @@ int main(int argc, char**argv) {
270 // reused in subprocesses 299 // reused in subprocesses
271 SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); 300 SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl));
272 301
273 struct argp argp = {opts, arg_parser, 0, desc}; 302 double d2h_scale, host_s;
274 argp_parse(&argp, argc, argv, 0, 0, &g_args);
275
276 if (g_args.num_threads == 0) {
277 fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n");
278 return 3;
279 }
280
281 // TODO: Rewrite this so that it's all (u)int64_t
282 // Skip, as this is 1-to-1 on GV100 and GV11B
283 //double d2h_scale = GetGPUTimerScale(dev);
284 // Get the offset of GPU time from CPU time
285 uint64_t dev_ns; 303 uint64_t dev_ns;
286 double host_s; 304 // Get the core-specific offset of GPU time from CPU time, and the
305 // core-specific difference in tick rates (typical variance of -13 to 60
306 // microseconds per second).
307 // XXX: This is not sufficient for time synchronization, see "BUGS" at the
308 // top of this file.
309 // XXX: This creates an implict context, but should reuse the above.
310 d2h_scale = InternalGetGPUTimerScale(dev);
287 InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); 311 InternalReadGPUNanoseconds(dev, &host_s, &dev_ns);
312 if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) {
313 fprintf(stderr, "Unabled to synchronize time with the GPU. Aborting...\n");
314 return 1;
315 }
288 GPU_TIME_OFFSET = dev_ns - s2ns(host_s); 316 GPU_TIME_OFFSET = dev_ns - s2ns(host_s);
317 // Necessary to synchronize experiments running on different CPU cores
318 // (as CPU clocks are only semi-synchronized)
319 fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", getpid(), (long)s2ns(host_s) - dev_ns);
320 fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.9f\n", getpid(), d2h_scale);
289 321
290 // Copy buffers are filled with random numbers. Seed the RNG. 322 // Copy buffers are filled with random numbers. Seed the RNG.
291 srand(0); 323 srand(0);
@@ -295,8 +327,12 @@ int main(int argc, char**argv) {
295 while (!g_args.thread_args[tid].is_ready) 327 while (!g_args.thread_args[tid].is_ready)
296 continue; 328 continue;
297 } 329 }
330
331 fprintf(stderr, "(%d) Initialization completed. Press enter to start copies...", getpid());
332 getc(stdin); // Wait for user
298 // Tell children initialization is done and that they can go 333 // Tell children initialization is done and that they can go
299 READY = 1; 334 READY = 1;
335
300 // Wait for threads to finish, and determine the earliest recorded time 336 // Wait for threads to finish, and determine the earliest recorded time
301 uint64_t smallest = UINT64_MAX; 337 uint64_t smallest = UINT64_MAX;
302 for (int tid = 0; tid < g_args.num_threads; tid++) { 338 for (int tid = 0; tid < g_args.num_threads; tid++) {