diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-19 19:50:29 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-19 20:31:11 -0500 |
commit | 18641058a2d60e172f18176b41d51baa706ffd85 (patch) | |
tree | 2d0a1a12c0dcb8a377eaaed4eb25ddf7fcf7372a | |
parent | a1418db164af92d350234dfe6846884562c9e227 (diff) |
Remove a potential cause of timing aberrations
It is possible to be preempted in the middle of reading
`globalclock64`. When this causes mismatched readings of the high
bits, we assumed the low bits were zero (as in the clock rollover
case). In the preemption case, this resulted in a time reading
that appears to be in the middle of another process's execution.
Correct this by re-reading the low bits when the high bits do not
match.
(If a short preemption occurs and the high bits match, it does not
matter whether the reading of the low bits occurred before or after
the preemption, as it will still be consistent with the high bits.)
This bug likely affected `preemption_logger`, since that benchmark
rapidly reads timestamps in a tight loop and is thus highly likely
to be preempted mid-read.
-rw-r--r-- | task_host_utilities.cu | 23 |
1 files changed, 16 insertions, 7 deletions
diff --git a/task_host_utilities.cu b/task_host_utilities.cu index e35292f..9bb95aa 100644 --- a/task_host_utilities.cu +++ b/task_host_utilities.cu | |||
@@ -1,5 +1,7 @@ | |||
1 | // This file contains the implementation of the functions defined in | 1 | // Copyright 2017-2019 Nathan Otterness |
2 | // task_host_utilities.h--used by task_host_utilities.c to work with the GPU. | 2 | // Copyright 2020-2024 Joshua Bakita |
3 | // This file was originally part of the cuda_scheduling_examiner tool, but has | ||
4 | // been copied and heavily modified for use in the gpu-microbench project. | ||
3 | #include <cuda_runtime.h> | 5 | #include <cuda_runtime.h> |
4 | #include <errno.h> | 6 | #include <errno.h> |
5 | #include <stdint.h> | 7 | #include <stdint.h> |
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) { | |||
46 | static __device__ inline uint64_t GlobalTimer64(void) { | 48 | static __device__ inline uint64_t GlobalTimer64(void) { |
47 | uint32_t lo_bits, hi_bits, hi_bits_2; | 49 | uint32_t lo_bits, hi_bits, hi_bits_2; |
48 | uint64_t ret; | 50 | uint64_t ret; |
49 | // Upper bits may rollover between our 1st and 2nd read | 51 | // The clock can roll over while we read the lo_bits in two cases: |
50 | // (The bug seems constrained to certain old Jetson boards, so this | 52 | // 1. We read the hi_bits right as they changed, and lo_bits should be 0. |
51 | // workaround could probably be gated to only those GPUs.) | 53 | // 2. We were preempted between reading hi_bits and hi_bits_2. In this case |
54 | // we need to re-read lo_bits, as they could be anything. Note that we | ||
55 | // assume that we'll never be suspended in an instruction-level preemption | ||
56 | // for more than ~4.29 seconds (which is how long it would take for the | ||
57 | // high bits to roll over again)---this allows us to just reread lo_bits. | ||
58 | // We also assume that this function will be preempted at at most one | ||
59 | // point during each invocation (the default timeslice length is 1-2 ms, | ||
60 | // so this assumption should always hold with the default scheduler). | ||
52 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); | 61 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); |
53 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); | 62 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); |
54 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); | 63 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); |
55 | // If upper bits rolled over, lo_bits = 0 | 64 | if (hi_bits != hi_bits_2) |
56 | lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits; | 65 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); |
57 | // SASS on older architectures (such as sm_52) is natively 32-bit, so the | 66 | // SASS on older architectures (such as sm_52) is natively 32-bit, so the |
58 | // following three lines get optimized out. | 67 | // following three lines get optimized out. |
59 | ret = hi_bits_2; | 68 | ret = hi_bits_2; |