1 files changed, 16 insertions, 7 deletions
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index e35292f..9bb95aa 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -1,5 +1,7 @@
-// This file contains the implementation of the functions defined in
+// Copyright 2017-2019 Nathan Otterness
-// task_host_utilities.h--used by task_host_utilities.c to work with the GPU.
+// Copyright 2020-2024 Joshua Bakita
+// This file was originally part of the cuda_scheduling_examiner tool, but has
+// been copied and heavily modified for use in the gpu-microbench project.
 #include <cuda_runtime.h>
 #include <errno.h>
 #include <stdint.h>
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) {
 static __device__ inline uint64_t GlobalTimer64(void) {
  uint32_t lo_bits, hi_bits, hi_bits_2;
  uint64_t ret;
-  // Upper bits may rollover between our 1st and 2nd read
+  // The clock can roll over while we read the lo_bits in two cases:
-  // (The bug seems constrained to certain old Jetson boards, so this
+  // 1. We read the hi_bits right as they changed, and lo_bits should be 0.
-  // workaround could probably be gated to only those GPUs.)
+  // 2. We were preempted between reading hi_bits and hi_bits_2. In this case
+  //    we need to re-read lo_bits, as they could be anything. Note that we
+  //    assume that we'll never be suspended in an instruction-level preemption
+  //    for more than ~4.29 seconds (which is how long it would take for the
+  //    high bits to roll over again)---this allows us to just reread lo_bits.
+  //    We also assume that this function will be preempted at at most one
+  //    point during each invocation (the default timeslice length is 1-2 ms,
+  //    so this assumption should always hold with the default scheduler).
  asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits));
  asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
  asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2));
-  // If upper bits rolled over, lo_bits = 0
+  if (hi_bits != hi_bits_2)
-  lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits;
+    asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
  // SASS on older architectures (such as sm_52) is natively 32-bit, so the
  // following three lines get optimized out.
  ret = hi_bits_2;