summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--task_host_utilities.cu23
1 files changed, 16 insertions, 7 deletions
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index e35292f..9bb95aa 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -1,5 +1,7 @@
1// This file contains the implementation of the functions defined in 1// Copyright 2017-2019 Nathan Otterness
2// task_host_utilities.h--used by task_host_utilities.c to work with the GPU. 2// Copyright 2020-2024 Joshua Bakita
3// This file was originally part of the cuda_scheduling_examiner tool, but has
4// been copied and heavily modified for use in the gpu-microbench project.
3#include <cuda_runtime.h> 5#include <cuda_runtime.h>
4#include <errno.h> 6#include <errno.h>
5#include <stdint.h> 7#include <stdint.h>
@@ -46,14 +48,21 @@ static double CurrentSeconds(void) {
46static __device__ inline uint64_t GlobalTimer64(void) { 48static __device__ inline uint64_t GlobalTimer64(void) {
47 uint32_t lo_bits, hi_bits, hi_bits_2; 49 uint32_t lo_bits, hi_bits, hi_bits_2;
48 uint64_t ret; 50 uint64_t ret;
49 // Upper bits may rollover between our 1st and 2nd read 51 // The clock can roll over while we read the lo_bits in two cases:
50 // (The bug seems constrained to certain old Jetson boards, so this 52 // 1. We read the hi_bits right as they changed, and lo_bits should be 0.
51 // workaround could probably be gated to only those GPUs.) 53 // 2. We were preempted between reading hi_bits and hi_bits_2. In this case
54 // we need to re-read lo_bits, as they could be anything. Note that we
55 // assume that we'll never be suspended in an instruction-level preemption
56 // for more than ~4.29 seconds (which is how long it would take for the
57 // high bits to roll over again)---this allows us to just reread lo_bits.
58 // We also assume that this function will be preempted at at most one
59 // point during each invocation (the default timeslice length is 1-2 ms,
60 // so this assumption should always hold with the default scheduler).
52 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits)); 61 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits));
53 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits)); 62 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
54 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2)); 63 asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(hi_bits_2));
55 // If upper bits rolled over, lo_bits = 0 64 if (hi_bits != hi_bits_2)
56 lo_bits = (hi_bits != hi_bits_2) ? 0 : lo_bits; 65 asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(lo_bits));
57 // SASS on older architectures (such as sm_52) is natively 32-bit, so the 66 // SASS on older architectures (such as sm_52) is natively 32-bit, so the
58 // following three lines get optimized out. 67 // following three lines get optimized out.
59 ret = hi_bits_2; 68 ret = hi_bits_2;