From 18641058a2d60e172f18176b41d51baa706ffd85 Mon Sep 17 00:00:00 2001
From: Joshua Bakita <bakitajoshua@gmail.com>
Date: Mon, 19 Feb 2024 19:50:29 -0500
Subject: Remove a potential cause of timing aberrations

It is possible to be preempted in the middle of reading
`globalclock64`. When this causes mismatched readings of the high
bits, we assumed the low bits were zero (as in the clock rollover
case). In the preemption case, this resulted in a time reading
that appears to be in the middle of another process's execution.

Correct this by re-reading the low bits when the high bits do not
match.

(If a short preemption occurs and the high bits match, it does not
matter the reading of the low bits occured before or after the
preemption, as it will still be consistent with the high bits.)

This bug likely affected `preemption_logger`, since that benchmark
rapidly reads timestamps in a tight loop and is thus highly likely
to be preempted mid-read.
---
 task_host_utilities.cu | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index e35292f..9bb95aa 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -1,5 +1,7 @@
-// This file contains the implementation of the functions defined in
-// task_host_utilities.h--used by task_host_utilities.c to work with the GPU.
+// Copyright 2017-2019 Nathan Otterness
+// Copyright 2020-2024 Joshua Bakita
+// This file was originally part of the cuda_scheduling_examiner tool, but has
+// been copied and heavily modified for use in the gpu-microbench project.
 #include <cuda_runtime.h>
 #include <errno.h>
 #include <stdint.h>
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) {
 static __device__ inline uint64_t GlobalTimer64(void) {
   uint32_t lo_bits, hi_bits, hi_bits_2;
   uint64_t ret;
-  // Upper bits may rollover between our 1st and 2nd read
-  // (The bug seems constrained to certain old Jetson boards, so this
-  // workaround could probably be gated to only those GPUs.)
+  // The clock can roll over while we read the lo_bits in two cases:
+  // 1. We read the hi_bits right as they changed, and lo_bits should be 0.
+  // 2. We were preempted between reading hi_bits and hi_bits_2. In this case
+  //    we need to re-read lo_bits, as they could be anything. Note that we
+  //    assume that we'll never be suspended in an instruction-level preemption
+  //    for more than ~4.29 seconds (which is how long it would take for the
+  //    high bits to roll over again)---this allows us to just reread lo_bits.
+  //    We also assume that this function will be preempted at at most one
+  //    point during each invocation (the default timeslice length is 1-2 ms,
+  //    so this assumption should always hold with the default scheduler).
   asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits));
   asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
   asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2));
-  // If upper bits rolled over, lo_bits = 0
-  lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits;
+  if (hi_bits != hi_bits_2)
+    asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
   // SASS on older architectures (such as sm_52) is natively 32-bit, so the
   // following three lines get optimized out.
   ret = hi_bits_2;
-- 
cgit v1.2.2