diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-19 20:39:39 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-19 20:39:39 -0500 |
commit | cd9ee070e2fcaa49fe35944ea0fd60ed5d197ba2 (patch) | |
tree | cb31473e1793e383a06dd0b560c7ab83374040ff | |
parent | 18641058a2d60e172f18176b41d51baa706ffd85 (diff) |
Split SAFE() macro into API-specific typed variants
Avoids incorrectly interpreting CUDA Driver Library error codes as
though they were CUDA Runtime Library error codes (the numbering is
similar, but not identical).
Also cleans up how we initialize and terminate a context for
capability checking in `mon_cross_ctx_copies`.
-rw-r--r-- | constant_cycles_kernel.cu | 1 | ||||
-rw-r--r-- | copy_experiments/mon_cross_ctx_copies.cu | 19 | ||||
-rw-r--r-- | preemption_logger.cu | 3 | ||||
-rw-r--r-- | task_host_utilities.cu | 6 | ||||
-rw-r--r-- | testbench.h | 30 |
5 files changed, 44 insertions, 15 deletions
diff --git a/constant_cycles_kernel.cu b/constant_cycles_kernel.cu index eda963d..db92b08 100644 --- a/constant_cycles_kernel.cu +++ b/constant_cycles_kernel.cu | |||
@@ -13,7 +13,6 @@ __global__ void loop_on_gpu(unsigned long iters, int *__unused) { | |||
13 | } | 13 | } |
14 | 14 | ||
15 | int main(int argc, char **argv) { | 15 | int main(int argc, char **argv) { |
16 | cudaError_t err; | ||
17 | int res, *__unused; | 16 | int res, *__unused; |
18 | struct timespec start, end; | 17 | struct timespec start, end; |
19 | 18 | ||
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu index d9749e0..93d0e4c 100644 --- a/copy_experiments/mon_cross_ctx_copies.cu +++ b/copy_experiments/mon_cross_ctx_copies.cu | |||
@@ -92,7 +92,6 @@ void cpu_copy_mon(int loops, char* cpu_mem) { | |||
92 | void* copy_thread(void* args_raw) { | 92 | void* copy_thread(void* args_raw) { |
93 | CUcontext ctx; | 93 | CUcontext ctx; |
94 | int dev, i; | 94 | int dev, i; |
95 | cudaError_t err; | ||
96 | int *barrier; | 95 | int *barrier; |
97 | int *barrier_dev_ptr; // On Pascal+ this will be the same as *barrier | 96 | int *barrier_dev_ptr; // On Pascal+ this will be the same as *barrier |
98 | char *pinned_hostmem, *devmem; | 97 | char *pinned_hostmem, *devmem; |
@@ -106,8 +105,8 @@ void* copy_thread(void* args_raw) { | |||
106 | // Explictly create a context (avoids creating a primary context implictly) | 105 | // Explictly create a context (avoids creating a primary context implictly) |
107 | // This has been verified on CUDA 11.1 to give each thread a different context | 106 | // This has been verified on CUDA 11.1 to give each thread a different context |
108 | // handle | 107 | // handle |
109 | cudaGetDevice(&dev); | 108 | SAFE(cudaGetDevice(&dev)); |
110 | SAFE(cuCtxCreate(&ctx, 0, dev)); | 109 | SAFE_D(cuCtxCreate(&ctx, 0, dev)); |
111 | 110 | ||
112 | uint64_t dev_ns, dev_ns2; | 111 | uint64_t dev_ns, dev_ns2; |
113 | double host_s, host_s2; | 112 | double host_s, host_s2; |
@@ -249,21 +248,27 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) { | |||
249 | } | 248 | } |
250 | 249 | ||
251 | int main(int argc, char**argv) { | 250 | int main(int argc, char**argv) { |
252 | int tmp, dev; | 251 | int tmp, dev = 0; |
252 | CUdevice dev_itrl; | ||
253 | uint64_t *ctx_times[MAX_THREADS] = {0}; | 253 | uint64_t *ctx_times[MAX_THREADS] = {0}; |
254 | pthread_t t[MAX_THREADS]; | 254 | pthread_t t[MAX_THREADS]; |
255 | global_args_t g_args = {0}; | 255 | global_args_t g_args = {0}; |
256 | 256 | ||
257 | // Temporarially initialize CUDA to query device attributes | ||
258 | SAFE_D(cuInit(0)); | ||
259 | SAFE_D(cuDeviceGet(&dev_itrl, dev)); | ||
257 | // Due to some laziness in how we handle barriers, this flag needs to be true | 260 | // Due to some laziness in how we handle barriers, this flag needs to be true |
258 | /// XXX: Still seems to work fine if it isn't??? | 261 | /// XXX: Still seems to work fine if it isn't??? |
259 | cudaGetDevice(&dev); | 262 | SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); |
260 | cudaDeviceGetAttribute(&tmp, cudaDevAttrCanMapHostMemory, dev); | ||
261 | if (!tmp) { | 263 | if (!tmp) { |
262 | fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" | 264 | fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" |
263 | "DRAM into the GPU virtual address space for accurate clock" | 265 | " DRAM into the GPU virtual address space for accurate clock " |
264 | "synchronization. Exiting...\n"); | 266 | "synchronization. Exiting...\n"); |
265 | return 1; | 267 | return 1; |
266 | } | 268 | } |
269 | // Terminate the context used for attrib check so it's not accidentially | ||
270 | // reused in subprocesses | ||
271 | SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); | ||
267 | 272 | ||
268 | struct argp argp = {opts, arg_parser, 0, desc}; | 273 | struct argp argp = {opts, arg_parser, 0, desc}; |
269 | argp_parse(&argp, argc, argv, 0, 0, &g_args); | 274 | argp_parse(&argp, argc, argv, 0, 0, &g_args); |
diff --git a/preemption_logger.cu b/preemption_logger.cu index 667c459..d93600e 100644 --- a/preemption_logger.cu +++ b/preemption_logger.cu | |||
@@ -27,7 +27,7 @@ struct interval { | |||
27 | }; | 27 | }; |
28 | 28 | ||
29 | // Minimum time discontinuity which indicates a gap between intervals | 29 | // Minimum time discontinuity which indicates a gap between intervals |
30 | // One clock tick is about 1ns | 30 | // Clock resolution is about 1ns, but it only ticks every 1us pre-H100 |
31 | #define MIN_PREEMPT_TICKS 2*1000 // ~2us | 31 | #define MIN_PREEMPT_TICKS 2*1000 // ~2us |
32 | 32 | ||
33 | // Watch for discontinuities in the GPU clock, indicating intervals during | 33 | // Watch for discontinuities in the GPU clock, indicating intervals during |
@@ -67,7 +67,6 @@ Spin on the GPU, logging intervals to stdout during which we are scheduled.\n\ | |||
67 | -r, --raw Print raw logged GPU times (skip conversion to CPU time).\n"; | 67 | -r, --raw Print raw logged GPU times (skip conversion to CPU time).\n"; |
68 | 68 | ||
69 | int main(int argc, char **argv) { | 69 | int main(int argc, char **argv) { |
70 | cudaError_t err; // Needed for the SAFE() macro | ||
71 | struct interval *ivls_gpu, *ivls; | 70 | struct interval *ivls_gpu, *ivls; |
72 | struct timespec start, end, end_ivls_only; | 71 | struct timespec start, end, end_ivls_only; |
73 | int num_ivls, skip_conversion; | 72 | int num_ivls, skip_conversion; |
diff --git a/task_host_utilities.cu b/task_host_utilities.cu index 9bb95aa..1f76080 100644 --- a/task_host_utilities.cu +++ b/task_host_utilities.cu | |||
@@ -4,6 +4,7 @@ | |||
4 | // been copied and heavily modified for use in the gpu-microbench project. | 4 | // been copied and heavily modified for use in the gpu-microbench project. |
5 | #include <cuda_runtime.h> | 5 | #include <cuda_runtime.h> |
6 | #include <errno.h> | 6 | #include <errno.h> |
7 | #include <sched.h> | ||
7 | #include <stdint.h> | 8 | #include <stdint.h> |
8 | #include <stdio.h> | 9 | #include <stdio.h> |
9 | #include <string.h> | 10 | #include <string.h> |
@@ -191,8 +192,8 @@ static void InternalReadGPUNanoseconds(int cuda_device, double *cpu_time, | |||
191 | if (!CheckCUDAError(cudaMemcpy(gpu_time, device_time, sizeof(device_time), | 192 | if (!CheckCUDAError(cudaMemcpy(gpu_time, device_time, sizeof(device_time), |
192 | cudaMemcpyDeviceToHost))) goto out; | 193 | cudaMemcpyDeviceToHost))) goto out; |
193 | max_error = (cpu_end - cpu_start) / 2.0; | 194 | max_error = (cpu_end - cpu_start) / 2.0; |
194 | fprintf(stderr, "Time synchronized to a maximum error of +/- %f us.\n", | 195 | fprintf(stderr, "Time synchronized to a maximum error of +/- %f us on CPU%d.\n", |
195 | max_error * (1000.0 * 1000.0)); | 196 | max_error * (1000.0 * 1000.0), sched_getcpu()); |
196 | out: | 197 | out: |
197 | cudaFree(device_time); | 198 | cudaFree(device_time); |
198 | cudaFree((void*)start_barrier); | 199 | cudaFree((void*)start_barrier); |
@@ -200,6 +201,7 @@ out: | |||
200 | cudaFree((void*)ready_barrier); | 201 | cudaFree((void*)ready_barrier); |
201 | } | 202 | } |
202 | 203 | ||
204 | // Returns 0 on success, 1 on error | ||
203 | int GetHostDeviceTimeOffset(int cuda_device, double *host_seconds, | 205 | int GetHostDeviceTimeOffset(int cuda_device, double *host_seconds, |
204 | uint64_t *gpu_nanoseconds) { | 206 | uint64_t *gpu_nanoseconds) { |
205 | uint64_t *shared_gpu_time = NULL; | 207 | uint64_t *shared_gpu_time = NULL; |
diff --git a/testbench.h b/testbench.h index 5b47bd4..5e77410 100644 --- a/testbench.h +++ b/testbench.h | |||
@@ -2,10 +2,34 @@ | |||
2 | * Header for miscellaneous experimental helper functions. | 2 | * Header for miscellaneous experimental helper functions. |
3 | */ | 3 | */ |
4 | 4 | ||
5 | // cudaError_t and CUResult can both safely be cast to an unsigned int | ||
6 | static __thread unsigned int __SAFE_err; | ||
7 | |||
8 | // The very strange cast in these macros is to satisfy two goals at tension: | ||
9 | // 1. This file should be able to be included in non-CUDA-using files, and thus | ||
10 | // should use no CUDA types outside of this macro. | ||
11 | // 2. We want to typecheck uses of these macros. The driver and runtime APIs | ||
12 | // do not have identical error numbers and/or meanings, so runtime library | ||
13 | // calls should use SAFE, and driver library calls should use SAFE_D. | ||
14 | // Our design allows typechecking while keeping a non-CUDA per-thread error var. | ||
15 | |||
16 | // For CUDA Runtime Library functions; typically those prefixed with `cuda` | ||
5 | #define SAFE(x) \ | 17 | #define SAFE(x) \ |
6 | if ((err = (cudaError_t)(x)) != 0) { \ | 18 | if ((*(cudaError_t*)(&__SAFE_err) = (x)) != 0) { \ |
7 | printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \ | 19 | printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \ |
8 | printf("Suspect line: %s\n", #x); \ | 20 | __FILE__, __LINE__, __SAFE_err, cudaGetErrorName((cudaError_t)__SAFE_err), cudaGetErrorString((cudaError_t)__SAFE_err), #x); \ |
21 | exit(1); \ | ||
22 | } | ||
23 | |||
24 | // For CUDA Driver Library functions; typically those prefixed with just `cu` | ||
25 | #define SAFE_D(x) \ | ||
26 | if ((*(CUresult*)&(__SAFE_err) = (x)) != 0) { \ | ||
27 | const char* name; \ | ||
28 | const char* desc; \ | ||
29 | cuGetErrorName((CUresult)__SAFE_err, &name); \ | ||
30 | cuGetErrorString((CUresult)__SAFE_err, &desc); \ | ||
31 | printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \ | ||
32 | __FILE__, __LINE__, __SAFE_err, name, desc, #x); \ | ||
9 | exit(1); \ | 33 | exit(1); \ |
10 | } | 34 | } |
11 | 35 | ||