summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2024-02-19 19:50:29 -0500
committerJoshua Bakita <bakitajoshua@gmail.com>2024-02-19 20:31:11 -0500
commit18641058a2d60e172f18176b41d51baa706ffd85 (patch)
tree2d0a1a12c0dcb8a377eaaed4eb25ddf7fcf7372a
parenta1418db164af92d350234dfe6846884562c9e227 (diff)
Remove a potential cause of timing aberrations
It is possible to be preempted in the middle of reading `globalclock64`. When this causes mismatched readings of the high bits, we assumed the low bits were zero (as in the clock rollover case). In the preemption case, this resulted in a time reading that appears to be in the middle of another process's execution. Correct this by re-reading the low bits when the high bits do not match. (If a short preemption occurs and the high bits match, it does not matter the reading of the low bits occured before or after the preemption, as it will still be consistent with the high bits.) This bug likely affected `preemption_logger`, since that benchmark rapidly reads timestamps in a tight loop and is thus highly likely to be preempted mid-read.
-rw-r--r--task_host_utilities.cu23
1 files changed, 16 insertions, 7 deletions
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index e35292f..9bb95aa 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -1,5 +1,7 @@
1// This file contains the implementation of the functions defined in 1// Copyright 2017-2019 Nathan Otterness
2// task_host_utilities.h--used by task_host_utilities.c to work with the GPU. 2// Copyright 2020-2024 Joshua Bakita
3// This file was originally part of the cuda_scheduling_examiner tool, but has
4// been copied and heavily modified for use in the gpu-microbench project.
3#include <cuda_runtime.h> 5#include <cuda_runtime.h>
4#include <errno.h> 6#include <errno.h>
5#include <stdint.h> 7#include <stdint.h>
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) {
46static __device__ inline uint64_t GlobalTimer64(void) { 48static __device__ inline uint64_t GlobalTimer64(void) {
47 uint32_t lo_bits, hi_bits, hi_bits_2; 49 uint32_t lo_bits, hi_bits, hi_bits_2;
48 uint64_t ret; 50 uint64_t ret;
49 // Upper bits may rollover between our 1st and 2nd read 51 // The clock can roll over while we read the lo_bits in two cases:
50 // (The bug seems constrained to certain old Jetson boards, so this 52 // 1. We read the hi_bits right as they changed, and lo_bits should be 0.
51 // workaround could probably be gated to only those GPUs.) 53 // 2. We were preempted between reading hi_bits and hi_bits_2. In this case
54 // we need to re-read lo_bits, as they could be anything. Note that we
55 // assume that we'll never be suspended in an instruction-level preemption
56 // for more than ~4.29 seconds (which is how long it would take for the
57 // high bits to roll over again)---this allows us to just reread lo_bits.
58 // We also assume that this function will be preempted at at most one
59 // point during each invocation (the default timeslice length is 1-2 ms,
60 // so this assumption should always hold with the default scheduler).
52 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); 61 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits));
53 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); 62 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
54 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); 63 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2));
55 // If upper bits rolled over, lo_bits = 0 64 if (hi_bits != hi_bits_2)
56 lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits; 65 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
57 // SASS on older architectures (such as sm_52) is natively 32-bit, so the 66 // SASS on older architectures (such as sm_52) is natively 32-bit, so the
58 // following three lines get optimized out. 67 // following three lines get optimized out.
59 ret = hi_bits_2; 68 ret = hi_bits_2;