diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-20 16:03:38 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-20 16:03:38 -0500 |
commit | 2accc2be54d3f9ad20d15f21bca6397ef6cabf92 (patch) | |
tree | 34f9d054f798173bdb3c610eadd2e17d201fb01b | |
parent | d90826c1cc5f03fdc0aaef5bf20c57aec6556940 (diff) |
Rewrite clock rate synchronization code to increase accuracy
Mirrors architecture used for instantaneous clock synchronization.
The PPM (parts-per-million) error after a two-second
synchronization period is shown below. (Results from the new
implementation after a 100-second synchronization period are used as
ground truth, since after 100 seconds the new and old implementations
disagree by at most 2 PPM.)
GPU (machine name) Before After
------------------------------ ------ ------
GTX 1060 3 GiB (jbakita-old): 12 1
GTX 970 (jbakita-old): 9 2
GTX 1080 Ti (yamaha): 5 1
RTX 6000 Ada (yamaha): 4 2
(An error of 1 PPM is 1 microsecond per second.)
Also modify `preemption_logger` to print the rate of skew as PPM,
rather than just as a multiplier.
-rw-r--r-- | preemption_logger.cu | 11 | ||||
-rw-r--r-- | task_host_utilities.cu | 97 |
2 files changed, 82 insertions, 26 deletions
diff --git a/preemption_logger.cu b/preemption_logger.cu index 89b348a..ec3c22e 100644 --- a/preemption_logger.cu +++ b/preemption_logger.cu | |||
@@ -90,10 +90,11 @@ int main(int argc, char **argv) { | |||
90 | // Synchronize the GPU and CPU clocks (if requested) using the utilities | 90 | // Synchronize the GPU and CPU clocks (if requested) using the utilities |
91 | // from task_host_utilities.cu. | 91 | // from task_host_utilities.cu. |
92 | if (!skip_conversion) { | 92 | if (!skip_conversion) { |
93 | // Note that this appears to almost always be 1.0 on recent GPUs | 93 | // The skew is between -13 and 60 microseconds per second on the GTX |
94 | d2h_scale = GetGPUTimerScale(0); | 94 | // 1080, GTX 1060 3 GiB, GTX 970, RTX 6000 Ada, and GTX 1080 Ti. |
95 | d2h_scale = InternalGetGPUTimerScale(0); | ||
95 | InternalReadGPUNanoseconds(0, &host_s, &dev_ns); | 96 | InternalReadGPUNanoseconds(0, &host_s, &dev_ns); |
96 | if (host_s == 0 && !dev_ns) { | 97 | if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) { |
97 | fprintf(stderr, "Unable to synchronize time with the GPU. Aborting...\n"); | 98 | fprintf(stderr, "Unable to synchronize time with the GPU. Aborting...\n"); |
98 | return 1; | 99 | return 1; |
99 | } | 100 | } |
@@ -153,7 +154,9 @@ int main(int argc, char **argv) { | |||
153 | fprintf(stderr, "(%d) Aprox launch overhead: %ld ns\n", pid, launch_oh); | 154 | fprintf(stderr, "(%d) Aprox launch overhead: %ld ns\n", pid, launch_oh); |
154 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", pid, | 155 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", pid, |
155 | (long)s2ns(host_s) - dev_ns); | 156 | (long)s2ns(host_s) - dev_ns); |
156 | fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.3f\n", pid, d2h_scale); | 157 | fprintf(stderr, "(%d) After 1 second, the GPU clock is %.f us %s (%.9fx d2h)\n", |
158 | pid, fabs((d2h_scale - 1) * 1e6), | ||
159 | d2h_scale > 1 ? "behind" : "ahead", d2h_scale); | ||
157 | } | 160 | } |
158 | 161 | ||
159 | return 0; | 162 | return 0; |
diff --git a/task_host_utilities.cu b/task_host_utilities.cu index 1f76080..1afdaf0 100644 --- a/task_host_utilities.cu +++ b/task_host_utilities.cu | |||
@@ -40,6 +40,17 @@ static double CurrentSeconds(void) { | |||
40 | return ((double) ts.tv_sec) + (((double) ts.tv_nsec) / 1e9); | 40 | return ((double) ts.tv_sec) + (((double) ts.tv_nsec) / 1e9); |
41 | } | 41 | } |
42 | 42 | ||
43 | // Note that CLOCK_MONOTONIC_RAW is slow before Linux 5.3 because it's not | ||
44 | // supported by vDSO. | ||
45 | static uint64_t CurrentNanoseconds(void) { | ||
46 | struct timespec ts; | ||
47 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) { | ||
48 | printf("Error getting time.\n"); | ||
49 | exit(1); | ||
50 | } | ||
51 | return ((uint64_t) ts.tv_sec) * 1000*1000*1000 + ((uint64_t) ts.tv_nsec); | ||
52 | } | ||
53 | |||
43 | // GlobalTimer64: Get 64-bit counter of current time on GPU | 54 | // GlobalTimer64: Get 64-bit counter of current time on GPU |
44 | // ***This is duplicated in benchmark_gpu_utilities.h*** | 55 | // ***This is duplicated in benchmark_gpu_utilities.h*** |
45 | #if __CUDA_ARCH__ >= 300 // Kepler+ | 56 | #if __CUDA_ARCH__ >= 300 // Kepler+ |
@@ -299,7 +310,7 @@ int GetMaxResidentThreads(int cuda_device) { | |||
299 | } | 310 | } |
300 | 311 | ||
301 | #if __CUDA_ARCH__ >= 300 // Kepler+ | 312 | #if __CUDA_ARCH__ >= 300 // Kepler+ |
302 | static __global__ void TimerSpin(uint64_t ns_to_spin) { | 313 | static __device__ void TimerSpin(uint64_t ns_to_spin) { |
303 | uint64_t start_time = GlobalTimer64(); | 314 | uint64_t start_time = GlobalTimer64(); |
304 | while ((GlobalTimer64() - start_time) < ns_to_spin) { | 315 | while ((GlobalTimer64() - start_time) < ns_to_spin) { |
305 | continue; | 316 | continue; |
@@ -314,7 +325,7 @@ static __device__ inline uint32_t Clock32(void) { | |||
314 | } | 325 | } |
315 | 326 | ||
316 | // 'clock' can easily roll over, so handle that for ancient architectures | 327 | // 'clock' can easily roll over, so handle that for ancient architectures |
317 | static __global__ void TimerSpin(uint64_t ns_to_spin) { | 328 | static __device__ void TimerSpin(uint64_t ns_to_spin) { |
318 | uint64_t total_time = 0; | 329 | uint64_t total_time = 0; |
319 | uint32_t last_time = Clock32(); | 330 | uint32_t last_time = Clock32(); |
320 | while (total_time < ns_to_spin) { | 331 | while (total_time < ns_to_spin) { |
@@ -335,28 +346,70 @@ static __global__ void TimerSpin(uint64_t ns_to_spin) { | |||
335 | #error Fermi-based GPUs (sm_2x) are unsupported! | 346 | #error Fermi-based GPUs (sm_2x) are unsupported! |
336 | #endif | 347 | #endif |
337 | 348 | ||
338 | // This function is intended to be run in a child process. Returns -1 on error. | 349 | // Waits on and sets barriers for the CPU before and after spinning for a |
350 | // specified number of clock ticks. | ||
351 | static __global__ void BarrierTimerSpin(uint64_t ns_to_spin, volatile uint32_t *ready_barrier, | ||
352 | volatile uint32_t *start_barrier, volatile uint32_t *end_barrier) { | ||
353 | *ready_barrier = 1; | ||
354 | while (!*start_barrier) | ||
355 | continue; | ||
356 | TimerSpin(ns_to_spin); | ||
357 | *end_barrier = 1; | ||
358 | } | ||
359 | |||
360 | // This function returns the number of CPU ticks that pass per GPU tick. This is | ||
361 | // intended to be run in a child process. Returns -1 on error. | ||
339 | static double InternalGetGPUTimerScale(int cuda_device) { | 362 | static double InternalGetGPUTimerScale(int cuda_device) { |
340 | struct timespec start, end; | 363 | volatile uint32_t *gpu_start_barrier, *start_barrier = NULL; |
341 | uint64_t nanoseconds_elapsed; | 364 | volatile uint32_t *gpu_end_barrier, *end_barrier = NULL; |
365 | volatile uint32_t *gpu_ready_barrier, *ready_barrier = NULL; | ||
366 | volatile uint64_t cpu_start, cpu_end; | ||
367 | double rate = -1; | ||
342 | if (!CheckCUDAError(cudaSetDevice(cuda_device))) return -1; | 368 | if (!CheckCUDAError(cudaSetDevice(cuda_device))) return -1; |
343 | // Run the kernel once to warm up the GPU. | 369 | if (!CheckCUDAError(cudaHostAlloc(&start_barrier, sizeof(*start_barrier), |
344 | TimerSpin<<<1, 1>>>(1000); | 370 | cudaHostAllocMapped))) goto out; |
345 | if (!CheckCUDAError(cudaDeviceSynchronize())) return -1; | 371 | if (!CheckCUDAError(cudaHostAlloc(&end_barrier, sizeof(*end_barrier), |
346 | // After warming up, do the actual timing. | 372 | cudaHostAllocMapped))) goto out; |
347 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &start) != 0) { | 373 | if (!CheckCUDAError(cudaHostAlloc(&ready_barrier, sizeof(*ready_barrier), |
348 | printf("Failed getting start time.\n"); | 374 | cudaHostAllocMapped))) goto out; |
349 | return -1; | 375 | // Setup device pointers for all the barriers |
350 | } | 376 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_start_barrier, |
351 | TimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION); | 377 | (uint32_t*)start_barrier, 0))) goto out; |
352 | if (!CheckCUDAError(cudaDeviceSynchronize())) return -1; | 378 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_end_barrier, |
353 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &end) != 0) { | 379 | (uint32_t*)end_barrier, 0))) goto out; |
354 | printf("Failed getting end time.\n"); | 380 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_ready_barrier, |
355 | return -1; | 381 | (uint32_t*)ready_barrier, 0))) goto out; |
356 | } | 382 | // Run the kernel a first time to warm up the GPU. |
357 | nanoseconds_elapsed = end.tv_sec * 1e9 + end.tv_nsec; | 383 | BarrierTimerSpin<<<1, 1>>>(100, gpu_ready_barrier, gpu_start_barrier, |
358 | nanoseconds_elapsed -= start.tv_sec * 1e9 + start.tv_nsec; | 384 | gpu_end_barrier); |
359 | return ((double) nanoseconds_elapsed) / ((double) TIMER_SPIN_DURATION); | 385 | // Barrier flows works very similarly here as in InternalReadGPUNanoseconds(), |
386 | // except we spin for the specified number of ticks between the start and end | ||
387 | // barriers. | ||
388 | *start_barrier = 1; | ||
389 | if (!CheckCUDAError(cudaDeviceSynchronize())) goto out; | ||
390 | // Now run the actual time-checking kernel. | ||
391 | *start_barrier = 0; | ||
392 | *end_barrier = 0; | ||
393 | *ready_barrier = 0; | ||
394 | BarrierTimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION, gpu_ready_barrier, gpu_start_barrier, | ||
395 | gpu_end_barrier); | ||
396 | // Wait for kernel to initialize | ||
397 | while (!*ready_barrier) | ||
398 | continue; | ||
399 | // Immediately record CPU time and tell GPU kernel to start spinning | ||
400 | cpu_start = CurrentNanoseconds(); | ||
401 | *start_barrier = 1; | ||
402 | // Wait for kernel to finish spinning, and immediately record CPU time again | ||
403 | while (!*end_barrier) | ||
404 | continue; | ||
405 | cpu_end = CurrentNanoseconds(); | ||
406 | // The rate is number of CPU ticks per GPU tick | ||
407 | rate = (cpu_end - cpu_start) / ((double) TIMER_SPIN_DURATION); | ||
408 | out: | ||
409 | cudaFree((void*)start_barrier); | ||
410 | cudaFree((void*)end_barrier); | ||
411 | cudaFree((void*)ready_barrier); | ||
412 | return rate; | ||
360 | } | 413 | } |
361 | 414 | ||
362 | double GetGPUTimerScale(int cuda_device) { | 415 | double GetGPUTimerScale(int cuda_device) { |