diff options
-rw-r--r-- | task_host_utilities.cu | 23 |
1 files changed, 16 insertions, 7 deletions
diff --git a/task_host_utilities.cu b/task_host_utilities.cu index e35292f..9bb95aa 100644 --- a/task_host_utilities.cu +++ b/task_host_utilities.cu | |||
@@ -1,5 +1,7 @@ | |||
1 | // This file contains the implementation of the functions defined in | 1 | // Copyright 2017-2019 Nathan Otterness |
2 | // task_host_utilities.h--used by task_host_utilities.c to work with the GPU. | 2 | // Copyright 2020-2024 Joshua Bakita |
3 | // This file was originally part of the cuda_scheduling_examiner tool, but has | ||
4 | // been copied and heavily modified for use in the gpu-microbench project. | ||
3 | #include <cuda_runtime.h> | 5 | #include <cuda_runtime.h> |
4 | #include <errno.h> | 6 | #include <errno.h> |
5 | #include <stdint.h> | 7 | #include <stdint.h> |
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) { | |||
46 | static __device__ inline uint64_t GlobalTimer64(void) { | 48 | static __device__ inline uint64_t GlobalTimer64(void) { |
47 | uint32_t lo_bits, hi_bits, hi_bits_2; | 49 | uint32_t lo_bits, hi_bits, hi_bits_2; |
48 | uint64_t ret; | 50 | uint64_t ret; |
49 | // Upper bits may rollover between our 1st and 2nd read | 51 | // The clock can roll over while we read the lo_bits in two cases: |
50 | // (The bug seems constrained to certain old Jetson boards, so this | 52 | // 1. We read the hi_bits right as they changed, and lo_bits should be 0. |
51 | // workaround could probably be gated to only those GPUs.) | 53 | // 2. We were preempted between reading hi_bits and hi_bits_2. In this case |
54 | // we need to re-read lo_bits, as they could be anything. Note that we | ||
55 | // assume that we'll never be suspended in an instruction-level preemption | ||
56 | // for more than ~4.29 seconds (which is how long it would take for the | ||
57 | // high bits to roll over again)---this allows us to just reread lo_bits. | ||
58 | // We also assume that this function will be preempted at at most one | ||
59 | // point during each invocation (the default timeslice length is 1-2 ms, | ||
60 | // so this assumption should always hold with the default scheduler). | ||
52 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); | 61 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); |
53 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); | 62 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); |
54 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); | 63 | asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); |
55 | // If upper bits rolled over, lo_bits = 0 | 64 | if (hi_bits != hi_bits_2) |
56 | lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits; | 65 | asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); |
57 | // SASS on older architectures (such as sm_52) is natively 32-bit, so the | 66 | // SASS on older architectures (such as sm_52) is natively 32-bit, so the |
58 | // following three lines get optimized out. | 67 | // following three lines get optimized out. |
59 | ret = hi_bits_2; | 68 | ret = hi_bits_2; |