diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-20 16:03:38 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-20 16:03:38 -0500 |
commit | 2accc2be54d3f9ad20d15f21bca6397ef6cabf92 (patch) | |
tree | 34f9d054f798173bdb3c610eadd2e17d201fb01b | |
parent | d90826c1cc5f03fdc0aaef5bf20c57aec6556940 (diff) |
Rewrite clock rate synchronization code to increase accuracy
Mirrors architecture used for instantaneous clock synchronization.
The PPM (parts-per-million) error after a two-second
synchronization period is shown below. (Results from the new
implementation after a 100-second synchronization period are used as
ground truth, since after 100 seconds the new and old implementations
disagree by at most 2 PPM.)
GPU (machine name) Before After
------------------------------ ------ ------
GTX 1060 3 GiB (jbakita-old): 12 1
GTX 970 (jbakita-old): 9 2
GTX 1080 Ti (yamaha): 5 1
RTX 6000 Ada (yamaha): 4 2
(An error of 1 PPM is 1 microsecond per second.)
Also modify `preemption_logger` to print the rate of skew as PPM,
rather than just as a multiplier.
-rw-r--r-- | preemption_logger.cu | 11 | ||||
-rw-r--r-- | task_host_utilities.cu | 97 |
2 files changed, 82 insertions, 26 deletions
diff --git a/preemption_logger.cu b/preemption_logger.cu index 89b348a..ec3c22e 100644 --- a/preemption_logger.cu +++ b/preemption_logger.cu | |||
@@ -90,10 +90,11 @@ int main(int argc, char **argv) { | |||
90 | // Synchronize the GPU and CPU clocks (if requested) using the utilities | 90 | // Synchronize the GPU and CPU clocks (if requested) using the utilities |
91 | // from task_host_utilities.cu. | 91 | // from task_host_utilities.cu. |
92 | if (!skip_conversion) { | 92 | if (!skip_conversion) { |
93 | // Note that this appears to almost always be 1.0 on recent GPUs | 93 | // The skew is between -13 and 60 microseconds per second on the GTX |
94 | d2h_scale = GetGPUTimerScale(0); | 94 | // 1080, GTX 1060 3 GiB, GTX 970, RTX 6000 Ada, and GTX 1080 Ti. |
95 | d2h_scale = InternalGetGPUTimerScale(0); | ||
95 | InternalReadGPUNanoseconds(0, &host_s, &dev_ns); | 96 | InternalReadGPUNanoseconds(0, &host_s, &dev_ns); |
96 | if (host_s == 0 && !dev_ns) { | 97 | if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) { |
97 | fprintf(stderr, "Unable to synchronize time with the GPU. Aborting...\n"); | 98 | fprintf(stderr, "Unable to synchronize time with the GPU. Aborting...\n"); |
98 | return 1; | 99 | return 1; |
99 | } | 100 | } |
@@ -153,7 +154,9 @@ int main(int argc, char **argv) { | |||
153 | fprintf(stderr, "(%d) Aprox launch overhead: %ld ns\n", pid, launch_oh); | 154 | fprintf(stderr, "(%d) Aprox launch overhead: %ld ns\n", pid, launch_oh); |
154 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", pid, | 155 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", pid, |
155 | (long)s2ns(host_s) - dev_ns); | 156 | (long)s2ns(host_s) - dev_ns); |
156 | fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.3f\n", pid, d2h_scale); | 157 | fprintf(stderr, "(%d) After 1 second, the GPU clock is %.f us %s (%.9fx d2h)\n", |
158 | pid, fabs((d2h_scale - 1) * 1e6), | ||
159 | d2h_scale > 1 ? "behind" : "ahead", d2h_scale); | ||
157 | } | 160 | } |
158 | 161 | ||
159 | return 0; | 162 | return 0; |
diff --git a/task_host_utilities.cu b/task_host_utilities.cu index 1f76080..1afdaf0 100644 --- a/task_host_utilities.cu +++ b/task_host_utilities.cu | |||
@@ -40,6 +40,17 @@ static double CurrentSeconds(void) { | |||
40 | return ((double) ts.tv_sec) + (((double) ts.tv_nsec) / 1e9); | 40 | return ((double) ts.tv_sec) + (((double) ts.tv_nsec) / 1e9); |
41 | } | 41 | } |
42 | 42 | ||
43 | // Note that CLOCK_MONOTONIC_RAW is slow before Linux 5.3 because it's not | ||
44 | // supported by vDSO. | ||
45 | static uint64_t CurrentNanoseconds(void) { | ||
46 | struct timespec ts; | ||
47 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) { | ||
48 | printf("Error getting time.\n"); | ||
49 | exit(1); | ||
50 | } | ||
51 | return ((uint64_t) ts.tv_sec) * 1000*1000*1000 + ((uint64_t) ts.tv_nsec); | ||
52 | } | ||
53 | |||
43 | // GlobalTimer64: Get 64-bit counter of current time on GPU | 54 | // GlobalTimer64: Get 64-bit counter of current time on GPU |
44 | // ***This is duplicated in benchmark_gpu_utilities.h*** | 55 | // ***This is duplicated in benchmark_gpu_utilities.h*** |
45 | #if __CUDA_ARCH__ >= 300 // Kepler+ | 56 | #if __CUDA_ARCH__ >= 300 // Kepler+ |
@@ -299,7 +310,7 @@ int GetMaxResidentThreads(int cuda_device) { | |||
299 | } | 310 | } |
300 | 311 | ||
301 | #if __CUDA_ARCH__ >= 300 // Kepler+ | 312 | #if __CUDA_ARCH__ >= 300 // Kepler+ |
302 | static __global__ void TimerSpin(uint64_t ns_to_spin) { | 313 | static __device__ void TimerSpin(uint64_t ns_to_spin) { |
303 | uint64_t start_time = GlobalTimer64(); | 314 | uint64_t start_time = GlobalTimer64(); |
304 | while ((GlobalTimer64() - start_time) < ns_to_spin) { | 315 | while ((GlobalTimer64() - start_time) < ns_to_spin) { |
305 | continue; | 316 | continue; |
@@ -314,7 +325,7 @@ static __device__ inline uint32_t Clock32(void) { | |||
314 | } | 325 | } |
315 | 326 | ||
316 | // 'clock' can easily roll over, so handle that for ancient architectures | 327 | // 'clock' can easily roll over, so handle that for ancient architectures |
317 | static __global__ void TimerSpin(uint64_t ns_to_spin) { | 328 | static __device__ void TimerSpin(uint64_t ns_to_spin) { |
318 | uint64_t total_time = 0; | 329 | uint64_t total_time = 0; |
319 | uint32_t last_time = Clock32(); | 330 | uint32_t last_time = Clock32(); |
320 | while (total_time < ns_to_spin) { | 331 | while (total_time < ns_to_spin) { |
@@ -335,28 +346,70 @@ static __global__ void TimerSpin(uint64_t ns_to_spin) { | |||
335 | #error Fermi-based GPUs (sm_2x) are unsupported! | 346 | #error Fermi-based GPUs (sm_2x) are unsupported! |
336 | #endif | 347 | #endif |
337 | 348 | ||
338 | // This function is intended to be run in a child process. Returns -1 on error. | 349 | // Waits on and sets barriers for the CPU before and after spinning for a |
350 | // specified number of clock ticks. | ||
351 | static __global__ void BarrierTimerSpin(uint64_t ns_to_spin, volatile uint32_t *ready_barrier, | ||
352 | volatile uint32_t *start_barrier, volatile uint32_t *end_barrier) { | ||
353 | *ready_barrier = 1; | ||
354 | while (!*start_barrier) | ||
355 | continue; | ||
356 | TimerSpin(ns_to_spin); | ||
357 | *end_barrier = 1; | ||
358 | } | ||
359 | |||
360 | // This function returns the number of CPU ticks that pass per GPU tick. This is | ||
361 | // intended to be run in a child process. Returns -1 on error. | ||
339 | static double InternalGetGPUTimerScale(int cuda_device) { | 362 | static double InternalGetGPUTimerScale(int cuda_device) { |
340 | struct timespec start, end; | 363 | volatile uint32_t *gpu_start_barrier, *start_barrier = NULL; |
341 | uint64_t nanoseconds_elapsed; | 364 | volatile uint32_t *gpu_end_barrier, *end_barrier = NULL; |
365 | volatile uint32_t *gpu_ready_barrier, *ready_barrier = NULL; | ||
366 | volatile uint64_t cpu_start, cpu_end; | ||
367 | double rate = -1; | ||
342 | if (!CheckCUDAError(cudaSetDevice(cuda_device))) return -1; | 368 | if (!CheckCUDAError(cudaSetDevice(cuda_device))) return -1; |
343 | // Run the kernel once to warm up the GPU. | 369 | if (!CheckCUDAError(cudaHostAlloc(&start_barrier, sizeof(*start_barrier), |
344 | TimerSpin<<<1, 1>>>(1000); | 370 | cudaHostAllocMapped))) goto out; |
345 | if (!CheckCUDAError(cudaDeviceSynchronize())) return -1; | 371 | if (!CheckCUDAError(cudaHostAlloc(&end_barrier, sizeof(*end_barrier), |
346 | // After warming up, do the actual timing. | 372 | cudaHostAllocMapped))) goto out; |
347 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &start) != 0) { | 373 | if (!CheckCUDAError(cudaHostAlloc(&ready_barrier, sizeof(*ready_barrier), |
348 | printf("Failed getting start time.\n"); | 374 | cudaHostAllocMapped))) goto out; |
349 | return -1; | 375 | // Setup device pointers for all the barriers |
350 | } | 376 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_start_barrier, |
351 | TimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION); | 377 | (uint32_t*)start_barrier, 0))) goto out; |
352 | if (!CheckCUDAError(cudaDeviceSynchronize())) return -1; | 378 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_end_barrier, |
353 | if (clock_gettime(CLOCK_MONOTONIC_RAW, &end) != 0) { | 379 | (uint32_t*)end_barrier, 0))) goto out; |
354 | printf("Failed getting end time.\n"); | 380 | if (!CheckCUDAError(cudaHostGetDevicePointer((uint32_t**)&gpu_ready_barrier, |
355 | return -1; | 381 | (uint32_t*)ready_barrier, 0))) goto out; |
356 | } | 382 | // Run the kernel a first time to warm up the GPU. |
357 | nanoseconds_elapsed = end.tv_sec * 1e9 + end.tv_nsec; | 383 | BarrierTimerSpin<<<1, 1>>>(100, gpu_ready_barrier, gpu_start_barrier, |
358 | nanoseconds_elapsed -= start.tv_sec * 1e9 + start.tv_nsec; | 384 | gpu_end_barrier); |
359 | return ((double) nanoseconds_elapsed) / ((double) TIMER_SPIN_DURATION); | 385 | // Barrier flows works very similarly here as in InternalReadGPUNanoseconds(), |
386 | // except we spin for the specified number of ticks between the start and end | ||
387 | // barriers. | ||
388 | *start_barrier = 1; | ||
389 | if (!CheckCUDAError(cudaDeviceSynchronize())) goto out; | ||
390 | // Now run the actual time-checking kernel. | ||
391 | *start_barrier = 0; | ||
392 | *end_barrier = 0; | ||
393 | *ready_barrier = 0; | ||
394 | BarrierTimerSpin<<<1, 1>>>(TIMER_SPIN_DURATION, gpu_ready_barrier, gpu_start_barrier, | ||
395 | gpu_end_barrier); | ||
396 | // Wait for kernel to initialize | ||
397 | while (!*ready_barrier) | ||
398 | continue; | ||
399 | // Immediately record CPU time and tell GPU kernel to start spinning | ||
400 | cpu_start = CurrentNanoseconds(); | ||
401 | *start_barrier = 1; | ||
402 | // Wait for kernel to finish spinning, and immediately record CPU time again | ||
403 | while (!*end_barrier) | ||
404 | continue; | ||
405 | cpu_end = CurrentNanoseconds(); | ||
406 | // The rate is number of CPU ticks per GPU tick | ||
407 | rate = (cpu_end - cpu_start) / ((double) TIMER_SPIN_DURATION); | ||
408 | out: | ||
409 | cudaFree((void*)start_barrier); | ||
410 | cudaFree((void*)end_barrier); | ||
411 | cudaFree((void*)ready_barrier); | ||
412 | return rate; | ||
360 | } | 413 | } |
361 | 414 | ||
362 | double GetGPUTimerScale(int cuda_device) { | 415 | double GetGPUTimerScale(int cuda_device) { |