author    Joshua Bakita <bakitajoshua@gmail.com>  2024-02-20 16:03:38 -0500
committer Joshua Bakita <bakitajoshua@gmail.com>  2024-02-20 16:03:38 -0500
commit    2accc2be54d3f9ad20d15f21bca6397ef6cabf92
tree      34f9d054f798173bdb3c610eadd2e17d201fb01b
parent    d90826c1cc5f03fdc0aaef5bf20c57aec6556940
Rewrite clock rate synchronization code to increase accuracy
Mirrors the architecture used for instantaneous clock synchronization. The PPM (parts-per-million) error after a two-second synchronization period is shown below, using results from the new implementation after a 100-second synchronization period as ground truth (after 100 seconds, the new and old implementations disagree by at most 2 PPM).

GPU (machine name)              Before  After
------------------------------  ------  ------
GTX 1060 3 GiB (jbakita-old):       12       1
GTX 970 (jbakita-old):               9       2
GTX 1080 Ti (yamaha):                5       1
RTX 6000 Ada (yamaha):               4       2

(An error of 1 PPM is 1 microsecond per second.)

Also modify `preemption_logger` to print the rate of skew as PPM, rather than just as a multiplier.
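For concreteness, here is a minimal sketch (not part of the commit; both scale values below are invented, and only the 12 PPM figure echoes the first table row) of how a PPM error relates to two measured CPU-ticks-per-GPU-tick scales:

#include <math.h>
#include <stdio.h>

int main(void) {
  // Hypothetical measurements; the 100-second run is treated as ground truth.
  double ground_truth_scale = 1.000037;  // invented value
  double two_second_scale = 1.000049;    // invented value
  // An error of 1 PPM corresponds to 1 microsecond of drift per second.
  double error_ppm = fabs(two_second_scale - ground_truth_scale) * 1e6;
  printf("error: %.0f PPM\n", error_ppm);  // prints "error: 12 PPM"
  return 0;
}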
-rw-r--r--  preemption_logger.cu    | 11
-rw-r--r--  task_host_utilities.cu  | 97
2 files changed, 82 insertions, 26 deletions
diff --git a/preemption_logger.cu b/preemption_logger.cu
index 89b348a..ec3c22e 100644
--- a/preemption_logger.cu
+++ b/preemption_logger.cu
@@ -90,10 +90,11 @@ int main(int argc, char **argv) {
   // Synchronize the GPU and CPU clocks (if requested) using the utilities
   // from task_host_utilities.cu.
   if (!skip_conversion) {
-    // Note that this appears to almost always be 1.0 on recent GPUs
-    d2h_scale = GetGPUTimerScale(0);
+    // The skew is between -13 and 60 microseconds per second on the GTX
+    // 1080, GTX 1060 3 GiB, GTX 970, RTX 6000 Ada, and GTX 1080 Ti.
+    d2h_scale = InternalGetGPUTimerScale(0);
     InternalReadGPUNanoseconds(0, &host_s, &dev_ns);
-    if (host_s == 0 && !dev_ns) {
+    if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) {
       fprintf(stderr, "Unable to synchronize time with the GPU. Aborting...\n");
       return 1;
     }
@@ -153,7 +154,9 @@ int main(int argc, char **argv) {
     fprintf(stderr, "(%d) Aprox launch overhead: %ld ns\n", pid, launch_oh);
     fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", pid,
             (long)s2ns(host_s) - dev_ns);
-    fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.3f\n", pid, d2h_scale);
+    fprintf(stderr, "(%d) After 1 second, the GPU clock is %.f us %s (%.9fx d2h)\n",
+            pid, fabs((d2h_scale - 1) * 1e6),
+            d2h_scale > 1 ? "behind" : "ahead", d2h_scale);
   }
 
   return 0;
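To make the new log line concrete, here is a standalone sketch (not from the repository; the pid and scale are made up) of what the reworked fprintf produces:

#include <math.h>
#include <stdio.h>

int main(void) {
  int pid = 1234;               // made-up process ID
  double d2h_scale = 1.000012;  // made up: 1 GPU tick takes 1.000012 CPU ns
  printf("(%d) After 1 second, the GPU clock is %.f us %s (%.9fx d2h)\n",
         pid, fabs((d2h_scale - 1) * 1e6),
         d2h_scale > 1 ? "behind" : "ahead", d2h_scale);
  // Prints: (1234) After 1 second, the GPU clock is 12 us behind (1.000012000x d2h)
  return 0;
}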
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index 1f76080..1afdaf0 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -40,6 +40,17 @@ static double CurrentSeconds(void) {
   return ((double) ts.tv_sec) + (((double) ts.tv_nsec) / 1e9);
 }
 
+// Note that CLOCK_MONOTONIC_RAW is slow before Linux 5.3 because it's not
+// supported by vDSO.
+static uint64_t CurrentNanoseconds(void) {
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
+    printf("Error getting time.\n");
+    exit(1);
+  }
+  return ((uint64_t) ts.tv_sec) * 1000*1000*1000 + ((uint64_t) ts.tv_nsec);
+}
+
 // GlobalTimer64: Get 64-bit counter of current time on GPU
 // ***This is duplicated in benchmark_gpu_utilities.h***
 #if __CUDA_ARCH__ >= 300 // Kepler+
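The new CurrentNanoseconds() deliberately reads CLOCK_MONOTONIC_RAW: CLOCK_MONOTONIC is slewed by NTP/adjtime, which would contaminate a measurement of raw hardware clock skew. A standalone sketch (not from this repository) that makes the difference visible:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static uint64_t ReadNs(clockid_t id) {
  struct timespec ts;
  clock_gettime(id, &ts);
  return (uint64_t) ts.tv_sec * 1000000000ull + (uint64_t) ts.tv_nsec;
}

int main(void) {
  // Sample both clocks, wait, and sample again; any difference in elapsed
  // time is NTP slewing applied to CLOCK_MONOTONIC only.
  uint64_t mono0 = ReadNs(CLOCK_MONOTONIC), raw0 = ReadNs(CLOCK_MONOTONIC_RAW);
  sleep(10);
  uint64_t mono1 = ReadNs(CLOCK_MONOTONIC), raw1 = ReadNs(CLOCK_MONOTONIC_RAW);
  long long drift = (long long)(mono1 - mono0) - (long long)(raw1 - raw0);
  printf("MONOTONIC - MONOTONIC_RAW drift: %lld ns over 10 s\n", drift);
  return 0;
}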
@@ -299,7 +310,7 @@ int GetMaxResidentThreads(int cuda_device) {
 }
 
 #if __CUDA_ARCH__ >= 300 // Kepler+
-static __global__ void TimerSpin(uint64_t ns_to_spin) {
+static __device__ void TimerSpin(uint64_t ns_to_spin) {
   uint64_t start_time = GlobalTimer64();
   while ((GlobalTimer64() - start_time) < ns_to_spin) {
     continue;
@@ -314,7 +325,7 @@ static __device__ inline uint32_t Clock32(void) {
 }
 
 // 'clock' can easily roll over, so handle that for ancient architectures
-static __global__ void TimerSpin(uint64_t ns_to_spin) {
+static __device__ void TimerSpin(uint64_t ns_to_spin) {
   uint64_t total_time = 0;
   uint32_t last_time = Clock32();
   while (total_time < ns_to_spin) {
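The loop body is cut off by the hunk boundary above; the usual rollover-safe trick it relies on (shown here as a self-contained host sketch, not the file's exact code) is that unsigned 32-bit subtraction yields the correct delta modulo 2^32 even across a wrap:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t last_time = 0xFFFFFF00u;  // sample taken just before the counter wraps
  uint32_t now = 0x00000100u;        // next sample, taken just after the wrap
  uint64_t elapsed = (uint32_t)(now - last_time);  // 0x200 == 512 ticks
  printf("elapsed ticks: %llu\n", (unsigned long long)elapsed);
  return 0;
}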
@@ -335,28 +346,70 @@ static __global__ void TimerSpin(uint64_t ns_to_spin) {
 #error Fermi-based GPUs (sm_2x) are unsupported!
 #endif
 
-// This function is intended to be run in a child process. Returns -1 on error.
+// Waits on and sets barriers for the CPU before and after spinning for a
+// specified number of clock ticks.
+static __global__ void BarrierTimerSpin(uint64_t ns_to_spin, volatile uint32_t *ready_barrier,
+    volatile uint32_t *start_barrier, volatile uint32_t *end_barrier) {
+  *ready_barrier = 1;
+  while (!*start_barrier)
+    continue;
+  TimerSpin(ns_to_spin);
+  *end_barrier = 1;
+}
+
+// This function returns the number of CPU ticks that pass per GPU tick. This is
+// intended to be run in a child process. Returns -1 on error.
 static double InternalGetGPUTimerScale(int cuda_device) {
-  struct timespec start, end;
-  uint64_t nanoseconds_elapsed;
+  volatile uint32_t *gpu_start_barrier, *start_barrier = NULL;
+  volatile uint32_t *gpu_end_barrier, *end_barrier = NULL;
+  volatile uint32_t *gpu_ready_barrier, *ready_barrier = NULL;
+  volatile uint64_t cpu_start, cpu_end;
+  double rate = -1;
   if (!CheckCUDAError(cudaSetDevice(cuda_device))) return -1;
-  // Run the kernel once to warm up the GPU.
-  TimerSpin<<<1, 1>>>(1000);
-  if (!CheckCUDAError(cudaDeviceSynchronize())) return -1;
-  // After warming up, do the actual timing.
-  if (clock_gettime(CLOCK_MONOTONIC_RAW, &start) != 0) {
-    printf("Failed getting start time.\n");
-    return -1;
-  }
-  TimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION);
-  if (!CheckCUDAError(cudaDeviceSynchronize())) return -1;
-  if (clock_gettime(CLOCK_MONOTONIC_RAW, &end) != 0) {
-    printf("Failed getting end time.\n");
-    return -1;
-  }
-  nanoseconds_elapsed = end.tv_sec * 1e9 + end.tv_nsec;
-  nanoseconds_elapsed -= start.tv_sec * 1e9 + start.tv_nsec;
-  return ((double) nanoseconds_elapsed) / ((double) TIMER_SPIN_DURATION);
+  if (!CheckCUDAError(cudaHostAlloc(&start_barrier, sizeof(*start_barrier),
+      cudaHostAllocMapped))) goto out;
+  if (!CheckCUDAError(cudaHostAlloc(&end_barrier, sizeof(*end_barrier),
+      cudaHostAllocMapped))) goto out;
+  if (!CheckCUDAError(cudaHostAlloc(&ready_barrier, sizeof(*ready_barrier),
+      cudaHostAllocMapped))) goto out;
+  // Set up device pointers for all the barriers
+  if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_start_barrier,
+      (uint32_t*)start_barrier, 0))) goto out;
+  if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_end_barrier,
+      (uint32_t*)end_barrier, 0))) goto out;
+  if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_ready_barrier,
+      (uint32_t*)ready_barrier, 0))) goto out;
+  // Run the kernel a first time to warm up the GPU.
+  BarrierTimerSpin<<<1, 1>>>(100, gpu_ready_barrier, gpu_start_barrier,
+      gpu_end_barrier);
+  // The barrier flow here works very similarly to InternalReadGPUNanoseconds(),
+  // except we spin for the specified number of ticks between the start and end
+  // barriers.
+  *start_barrier = 1;
+  if (!CheckCUDAError(cudaDeviceSynchronize())) goto out;
+  // Now run the actual time-checking kernel.
+  *start_barrier = 0;
+  *end_barrier = 0;
+  *ready_barrier = 0;
+  BarrierTimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION, gpu_ready_barrier, gpu_start_barrier,
+      gpu_end_barrier);
+  // Wait for kernel to initialize
+  while (!*ready_barrier)
+    continue;
+  // Immediately record CPU time and tell GPU kernel to start spinning
+  cpu_start = CurrentNanoseconds();
+  *start_barrier = 1;
+  // Wait for kernel to finish spinning, and immediately record CPU time again
+  while (!*end_barrier)
+    continue;
+  cpu_end = CurrentNanoseconds();
+  // The rate is the number of CPU ticks per GPU tick
+  rate = (cpu_end - cpu_start) / ((double) TIMER_SPIN_DURATION);
+out:
+  cudaFreeHost((void*)start_barrier);
+  cudaFreeHost((void*)end_barrier);
+  cudaFreeHost((void*)ready_barrier);
+  return rate;
 }
 
 double GetGPUTimerScale(int cuda_device) {
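The body of GetGPUTimerScale() is unchanged by this commit and not shown here; per the comment above, InternalGetGPUTimerScale() is meant to run in a child process. A hedged sketch of what such a fork-and-pipe wrapper typically looks like (names and details are illustrative, not the repository's actual implementation):

// Illustrative pattern only. Running the measurement in a forked child keeps
// the CUDA context it creates from leaking into (or perturbing) the parent.
#include <sys/wait.h>
#include <unistd.h>

static double MeasureInChild(int cuda_device, double (*measure)(int)) {
  int fds[2];
  double result = -1;
  if (pipe(fds) != 0) return -1;
  pid_t pid = fork();
  if (pid < 0) return -1;
  if (pid == 0) {
    // Child: run the measurement and ship the result back over the pipe.
    double r = measure(cuda_device);
    write(fds[1], &r, sizeof(r));
    _exit(0);
  }
  // Parent: read the child's result, then reap it.
  if (read(fds[0], &result, sizeof(result)) != sizeof(result)) result = -1;
  waitpid(pid, NULL, 0);
  close(fds[0]);
  close(fds[1]);
  return result;
}

A wrapper of this shape would be invoked as MeasureInChild(cuda_device, InternalGetGPUTimerScale); isolating the CUDA context in a short-lived child means the measurement cannot leave warmed-up GPU state or allocations behind in the caller.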