Split SAFE() macro into API-specific typed variants

Avoids incorrectly interpreting CUDA Driver Library error codes as though they were CUDA Runtime Library error codes (the numbering is similar, but not identical). Also cleans up how we initialize and terminate a context for capability checking in `mon_cross_ctx_copies`.
author: Joshua Bakita <bakitajoshua@gmail.com> 2024-02-19 20:39:39 -0500
committer: Joshua Bakita <bakitajoshua@gmail.com> 2024-02-19 20:39:39 -0500
commit: cd9ee070e2fcaa49fe35944ea0fd60ed5d197ba2 (patch)
tree: cb31473e1793e383a06dd0b560c7ab83374040ff
parent: 18641058a2d60e172f18176b41d51baa706ffd85 (diff)
5 files changed, 44 insertions, 15 deletions
diff --git a/constant_cycles_kernel.cu b/constant_cycles_kernel.cu
index eda963d..db92b08 100644
--- a/constant_cycles_kernel.cu
+++ b/constant_cycles_kernel.cu
@@ -13,7 +13,6 @@ __global__ void loop_on_gpu(unsigned long iters, int *__unused) {
 }
 int main(int argc, char **argv) {
-        cudaError_t err;
        int res, *__unused;
        struct timespec start, end;
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu
index d9749e0..93d0e4c 100644
--- a/copy_experiments/mon_cross_ctx_copies.cu
+++ b/copy_experiments/mon_cross_ctx_copies.cu
@@ -92,7 +92,6 @@ void cpu_copy_mon(int loops, char* cpu_mem) {
 void* copy_thread(void* args_raw) {
        CUcontext ctx;
        int dev, i;
-        cudaError_t err;
        int *barrier;
        int *barrier_dev_ptr; // On Pascal+ this will be the same as *barrier
        char *pinned_hostmem, *devmem;
@@ -106,8 +105,8 @@ void* copy_thread(void* args_raw) {
        // Explictly create a context (avoids creating a primary context implictly)
        // This has been verified on CUDA 11.1 to give each thread a different context
        // handle
-        cudaGetDevice(&dev);
+        SAFE(cudaGetDevice(&dev));
-        SAFE(cuCtxCreate(&ctx, 0, dev));
+        SAFE_D(cuCtxCreate(&ctx, 0, dev));
        uint64_t dev_ns, dev_ns2;
        double host_s, host_s2;
@@ -249,21 +248,27 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) {
 }
 int main(int argc, char**argv) {
-        int tmp, dev;
+        int tmp, dev = 0;
+        CUdevice dev_itrl;
        uint64_t *ctx_times[MAX_THREADS] = {0};
        pthread_t t[MAX_THREADS];
        global_args_t g_args = {0};
+        // Temporarily initialize CUDA to query device attributes
+        SAFE_D(cuInit(0));
+        SAFE_D(cuDeviceGet(&dev_itrl, dev));
        // Due to some laziness in how we handle barriers, this flag needs to be true
        /// XXX: Still seems to work fine if it isn't???
-        cudaGetDevice(&dev);
+        SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl));
-        cudaDeviceGetAttribute(&tmp, cudaDevAttrCanMapHostMemory, dev);
        if (!tmp) {
                fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)"
-                                "DRAM into the GPU virtual address space for accurate clock"
+                                " DRAM into the GPU virtual address space for accurate clock "
                                "synchronization. Exiting...\n");
                return 1;
        }
+        // Terminate the context used for attrib check so it's not accidentally
+        // reused in subprocesses
+        SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl));
        struct argp argp = {opts, arg_parser, 0, desc};
        argp_parse(&argp, argc, argv, 0, 0, &g_args);
diff --git a/preemption_logger.cu b/preemption_logger.cu
index 667c459..d93600e 100644
--- a/preemption_logger.cu
+++ b/preemption_logger.cu
@@ -27,7 +27,7 @@ struct interval {
 };
 // Minimum time discontinuity which indicates a gap between intervals
-// One clock tick is about 1ns
+// Clock resolution is about 1ns, but it only ticks every 1us pre-H100
 #define MIN_PREEMPT_TICKS 2*1000 // ~2us
 // Watch for discontinuities in the GPU clock, indicating intervals during
@@ -67,7 +67,6 @@ Spin on the GPU, logging intervals to stdout during which we are scheduled.\n\
  -r, --raw       Print raw logged GPU times (skip conversion to CPU time).\n";
 int main(int argc, char **argv) {
-        cudaError_t err; // Needed for the SAFE() macro
        struct interval *ivls_gpu, *ivls;
        struct timespec start, end, end_ivls_only;
        int num_ivls, skip_conversion;
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index 9bb95aa..1f76080 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -4,6 +4,7 @@
 // been copied and heavily modified for use in the gpu-microbench project.
 #include <cuda_runtime.h>
 #include <errno.h>
+#include <sched.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
@@ -191,8 +192,8 @@ static void InternalReadGPUNanoseconds(int cuda_device, double *cpu_time,
  if (!CheckCUDAError(cudaMemcpy(gpu_time, device_time, sizeof(device_time),
    cudaMemcpyDeviceToHost))) goto out;
  max_error = (cpu_end - cpu_start) / 2.0;
-  fprintf(stderr, "Time synchronized to a maximum error of +/- %f us.\n",
+  fprintf(stderr, "Time synchronized to a maximum error of +/- %f us on CPU%d.\n",
-    max_error * (1000.0 * 1000.0));
+    max_error * (1000.0 * 1000.0), sched_getcpu());
 out:
  cudaFree(device_time);
  cudaFree((void*)start_barrier);
@@ -200,6 +201,7 @@ out:
  cudaFree((void*)ready_barrier);
 }
+// Returns 0 on success, 1 on error
 int GetHostDeviceTimeOffset(int cuda_device, double *host_seconds,
  uint64_t *gpu_nanoseconds) {
  uint64_t *shared_gpu_time = NULL;
diff --git a/testbench.h b/testbench.h
index 5b47bd4..5e77410 100644
--- a/testbench.h
+++ b/testbench.h
@@ -2,10 +2,34 @@
 * Header for miscellaneous experimental helper functions.
 */
+// cudaError_t and CUresult can both safely be cast to an unsigned int
+static __thread unsigned int __SAFE_err;
+// The very strange cast in these macros is to satisfy two goals at tension:
+// 1. This file should be able to be included in non-CUDA-using files, and thus
+//    should use no CUDA types outside of this macro.
+// 2. We want to typecheck uses of these macros. The driver and runtime APIs
+//    do not have identical error numbers and/or meanings, so runtime library
+//    calls should use SAFE, and driver library calls should use SAFE_D.
+// Our design allows typechecking while keeping a non-CUDA per-thread error var.
+// For CUDA Runtime Library functions; typically those prefixed with `cuda`
 #define SAFE(x) \
-        if ((err = (cudaError_t)(x)) != 0) { \
+        if ((*(cudaError_t*)(&__SAFE_err) = (x)) != 0) { \
-                printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
+                printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
-                printf("Suspect line: %s\n", #x); \
+                       __FILE__, __LINE__, __SAFE_err, cudaGetErrorName((cudaError_t)__SAFE_err), cudaGetErrorString((cudaError_t)__SAFE_err), #x); \
+                exit(1); \
+        }
+// For CUDA Driver Library functions; typically those prefixed with just `cu`
+#define SAFE_D(x) \
+        if ((*(CUresult*)&(__SAFE_err) = (x)) != 0) { \
+                const char* name; \
+                const char* desc; \
+                cuGetErrorName((CUresult)__SAFE_err, &name); \
+                cuGetErrorString((CUresult)__SAFE_err, &desc); \
+                printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
+                       __FILE__, __LINE__, __SAFE_err, name, desc, #x); \
                exit(1); \
        }
author	Joshua Bakita <bakitajoshua@gmail.com>	2024-02-19 20:39:39 -0500
committer	Joshua Bakita <bakitajoshua@gmail.com>	2024-02-19 20:39:39 -0500
commit	cd9ee070e2fcaa49fe35944ea0fd60ed5d197ba2 (patch)
tree	cb31473e1793e383a06dd0b560c7ab83374040ff
parent	18641058a2d60e172f18176b41d51baa706ffd85 (diff)