summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2024-02-19 20:39:39 -0500
committerJoshua Bakita <bakitajoshua@gmail.com>2024-02-19 20:39:39 -0500
commitcd9ee070e2fcaa49fe35944ea0fd60ed5d197ba2 (patch)
treecb31473e1793e383a06dd0b560c7ab83374040ff
parent18641058a2d60e172f18176b41d51baa706ffd85 (diff)
Split SAFE() macro into API-specific typed variants
Avoids incorrectly interpreting CUDA Driver Library error codes as though they were CUDA Runtime Library error codes (the numbering is similar, but not identical). Also cleans up how we initialize and terminate a context for capability checking in `mon_cross_ctx_copies`.
-rw-r--r--constant_cycles_kernel.cu1
-rw-r--r--copy_experiments/mon_cross_ctx_copies.cu19
-rw-r--r--preemption_logger.cu3
-rw-r--r--task_host_utilities.cu6
-rw-r--r--testbench.h30
5 files changed, 44 insertions, 15 deletions
diff --git a/constant_cycles_kernel.cu b/constant_cycles_kernel.cu
index eda963d..db92b08 100644
--- a/constant_cycles_kernel.cu
+++ b/constant_cycles_kernel.cu
@@ -13,7 +13,6 @@ __global__ void loop_on_gpu(unsigned long iters, int *__unused) {
13} 13}
14 14
15int main(int argc, char **argv) { 15int main(int argc, char **argv) {
16 cudaError_t err;
17 int res, *__unused; 16 int res, *__unused;
18 struct timespec start, end; 17 struct timespec start, end;
19 18
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu
index d9749e0..93d0e4c 100644
--- a/copy_experiments/mon_cross_ctx_copies.cu
+++ b/copy_experiments/mon_cross_ctx_copies.cu
@@ -92,7 +92,6 @@ void cpu_copy_mon(int loops, char* cpu_mem) {
92void* copy_thread(void* args_raw) { 92void* copy_thread(void* args_raw) {
93 CUcontext ctx; 93 CUcontext ctx;
94 int dev, i; 94 int dev, i;
95 cudaError_t err;
96 int *barrier; 95 int *barrier;
97 int *barrier_dev_ptr; // On Pascal+ this will be the same as *barrier 96 int *barrier_dev_ptr; // On Pascal+ this will be the same as *barrier
98 char *pinned_hostmem, *devmem; 97 char *pinned_hostmem, *devmem;
@@ -106,8 +105,8 @@ void* copy_thread(void* args_raw) {
106 // Explictly create a context (avoids creating a primary context implictly) 105 // Explictly create a context (avoids creating a primary context implictly)
107 // This has been verified on CUDA 11.1 to give each thread a different context 106 // This has been verified on CUDA 11.1 to give each thread a different context
108 // handle 107 // handle
109 cudaGetDevice(&dev); 108 SAFE(cudaGetDevice(&dev));
110 SAFE(cuCtxCreate(&ctx, 0, dev)); 109 SAFE_D(cuCtxCreate(&ctx, 0, dev));
111 110
112 uint64_t dev_ns, dev_ns2; 111 uint64_t dev_ns, dev_ns2;
113 double host_s, host_s2; 112 double host_s, host_s2;
@@ -249,21 +248,27 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) {
249} 248}
250 249
251int main(int argc, char**argv) { 250int main(int argc, char**argv) {
252 int tmp, dev; 251 int tmp, dev = 0;
252 CUdevice dev_itrl;
253 uint64_t *ctx_times[MAX_THREADS] = {0}; 253 uint64_t *ctx_times[MAX_THREADS] = {0};
254 pthread_t t[MAX_THREADS]; 254 pthread_t t[MAX_THREADS];
255 global_args_t g_args = {0}; 255 global_args_t g_args = {0};
256 256
257 // Temporarially initialize CUDA to query device attributes
258 SAFE_D(cuInit(0));
259 SAFE_D(cuDeviceGet(&dev_itrl, dev));
257 // Due to some laziness in how we handle barriers, this flag needs to be true 260 // Due to some laziness in how we handle barriers, this flag needs to be true
258 /// XXX: Still seems to work fine if it isn't??? 261 /// XXX: Still seems to work fine if it isn't???
259 cudaGetDevice(&dev); 262 SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl));
260 cudaDeviceGetAttribute(&tmp, cudaDevAttrCanMapHostMemory, dev);
261 if (!tmp) { 263 if (!tmp) {
262 fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" 264 fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)"
263 "DRAM into the GPU virtual address space for accurate clock" 265 " DRAM into the GPU virtual address space for accurate clock "
264 "synchronization. Exiting...\n"); 266 "synchronization. Exiting...\n");
265 return 1; 267 return 1;
266 } 268 }
269 // Terminate the context used for attrib check so it's not accidentially
270 // reused in subprocesses
271 SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl));
267 272
268 struct argp argp = {opts, arg_parser, 0, desc}; 273 struct argp argp = {opts, arg_parser, 0, desc};
269 argp_parse(&argp, argc, argv, 0, 0, &g_args); 274 argp_parse(&argp, argc, argv, 0, 0, &g_args);
diff --git a/preemption_logger.cu b/preemption_logger.cu
index 667c459..d93600e 100644
--- a/preemption_logger.cu
+++ b/preemption_logger.cu
@@ -27,7 +27,7 @@ struct interval {
27}; 27};
28 28
29// Minimum time discontinuity which indicates a gap between intervals 29// Minimum time discontinuity which indicates a gap between intervals
30// One clock tick is about 1ns 30// Clock resolution is about 1ns, but it only ticks every 1us pre-H100
31#define MIN_PREEMPT_TICKS 2*1000 // ~2us 31#define MIN_PREEMPT_TICKS 2*1000 // ~2us
32 32
33// Watch for discontinuities in the GPU clock, indicating intervals during 33// Watch for discontinuities in the GPU clock, indicating intervals during
@@ -67,7 +67,6 @@ Spin on the GPU, logging intervals to stdout during which we are scheduled.\n\
67 -r, --raw Print raw logged GPU times (skip conversion to CPU time).\n"; 67 -r, --raw Print raw logged GPU times (skip conversion to CPU time).\n";
68 68
69int main(int argc, char **argv) { 69int main(int argc, char **argv) {
70 cudaError_t err; // Needed for the SAFE() macro
71 struct interval *ivls_gpu, *ivls; 70 struct interval *ivls_gpu, *ivls;
72 struct timespec start, end, end_ivls_only; 71 struct timespec start, end, end_ivls_only;
73 int num_ivls, skip_conversion; 72 int num_ivls, skip_conversion;
diff --git a/task_host_utilities.cu b/task_host_utilities.cu
index 9bb95aa..1f76080 100644
--- a/task_host_utilities.cu
+++ b/task_host_utilities.cu
@@ -4,6 +4,7 @@
4// been copied and heavily modified for use in the gpu-microbench project. 4// been copied and heavily modified for use in the gpu-microbench project.
5#include <cuda_runtime.h> 5#include <cuda_runtime.h>
6#include <errno.h> 6#include <errno.h>
7#include <sched.h>
7#include <stdint.h> 8#include <stdint.h>
8#include <stdio.h> 9#include <stdio.h>
9#include <string.h> 10#include <string.h>
@@ -191,8 +192,8 @@ static void InternalReadGPUNanoseconds(int cuda_device, double *cpu_time,
191 if (!CheckCUDAError(cudaMemcpy(gpu_time, device_time, sizeof(device_time), 192 if (!CheckCUDAError(cudaMemcpy(gpu_time, device_time, sizeof(device_time),
192 cudaMemcpyDeviceToHost))) goto out; 193 cudaMemcpyDeviceToHost))) goto out;
193 max_error = (cpu_end - cpu_start) / 2.0; 194 max_error = (cpu_end - cpu_start) / 2.0;
194 fprintf(stderr, "Time synchronized to a maximum error of +/- %f us.\n", 195 fprintf(stderr, "Time synchronized to a maximum error of +/- %f us on CPU%d.\n",
195 max_error * (1000.0 * 1000.0)); 196 max_error * (1000.0 * 1000.0), sched_getcpu());
196out: 197out:
197 cudaFree(device_time); 198 cudaFree(device_time);
198 cudaFree((void*)start_barrier); 199 cudaFree((void*)start_barrier);
@@ -200,6 +201,7 @@ out:
200 cudaFree((void*)ready_barrier); 201 cudaFree((void*)ready_barrier);
201} 202}
202 203
204// Returns 0 on success, 1 on error
203int GetHostDeviceTimeOffset(int cuda_device, double *host_seconds, 205int GetHostDeviceTimeOffset(int cuda_device, double *host_seconds,
204 uint64_t *gpu_nanoseconds) { 206 uint64_t *gpu_nanoseconds) {
205 uint64_t *shared_gpu_time = NULL; 207 uint64_t *shared_gpu_time = NULL;
diff --git a/testbench.h b/testbench.h
index 5b47bd4..5e77410 100644
--- a/testbench.h
+++ b/testbench.h
@@ -2,10 +2,34 @@
2 * Header for miscellaneous experimental helper functions. 2 * Header for miscellaneous experimental helper functions.
3 */ 3 */
4 4
5// cudaError_t and CUResult can both safely be cast to an unsigned int
6static __thread unsigned int __SAFE_err;
7
8// The very strange cast in these macros is to satisfy two goals at tension:
9// 1. This file should be able to be included in non-CUDA-using files, and thus
10// should use no CUDA types outside of this macro.
11// 2. We want to typecheck uses of these macros. The driver and runtime APIs
12// do not have identical error numbers and/or meanings, so runtime library
13// calls should use SAFE, and driver library calls should use SAFE_D.
14// Our design allows typechecking while keeping a non-CUDA per-thread error var.
15
16// For CUDA Runtime Library functions; typically those prefixed with `cuda`
5#define SAFE(x) \ 17#define SAFE(x) \
6 if ((err = (cudaError_t)(x)) != 0) { \ 18 if ((*(cudaError_t*)(&__SAFE_err) = (x)) != 0) { \
7 printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \ 19 printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
8 printf("Suspect line: %s\n", #x); \ 20 __FILE__, __LINE__, __SAFE_err, cudaGetErrorName((cudaError_t)__SAFE_err), cudaGetErrorString((cudaError_t)__SAFE_err), #x); \
21 exit(1); \
22 }
23
24// For CUDA Driver Library functions; typically those prefixed with just `cu`
25#define SAFE_D(x) \
26 if ((*(CUresult*)&(__SAFE_err) = (x)) != 0) { \
27 const char* name; \
28 const char* desc; \
29 cuGetErrorName((CUresult)__SAFE_err, &name); \
30 cuGetErrorString((CUresult)__SAFE_err, &desc); \
31 printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
32 __FILE__, __LINE__, __SAFE_err, name, desc, #x); \
9 exit(1); \ 33 exit(1); \
10 } 34 }
11 35