/**
 * Copyright 2022 Joshua Bakita
 *
 * Run many iterations of GPU paging on the same CUDA context, and print the
 * average time per iteration.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>        // atoi, exit
#include <curand_kernel.h> // curandState_t and curand
#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
#include <time.h>          // clock_gettime
#include <sys/ioctl.h>     // ioctl
#include <unistd.h>        // sleep

#define s2ns(s) ((s)*1000l*1000l*1000l)
#define ns2us(ns) ((ns)/1000l)
// Parenthesized so the macro expands to a single value in any expression
#define GiB (1024l*1024l*1024l)

// Originally from copy_testbed.h in the copy_experiments set
#define SAFE(x) \
	if ((err = (cudaError_t)(x)) != 0) { \
		printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
		printf("Suspect line: %s\n", #x); \
		exit(1); \
	}

// Fill buffer with random non-zero bytes. Supports buffers >4GiB.
// Original function on CPU part of copy_only.cu
// @param buf Pointer to buffer
// @param buf_len Length of buffer in bytes
// @note Supports splitting the work across the threads of a single block
// @note Expects a 1-D launch of exactly one block (work is divided by
//       threadIdx.x/blockDim.x only; extra blocks would redo the same work)
// NOTE(review): rng_state is a single state shared (unsynchronized) by all
// threads and is never seeded with curand_init(). That is a data race on the
// state, but output quality doesn't matter here: max(..., 1) guarantees every
// byte written is non-zero, which is all count_zero() later checks.
__device__ curandState_t rng_state;
__global__ void fill_rand(char* buf, uint64_t buf_len) {
	uint64_t to = buf_len;
	uint64_t i = 0;
	if (blockDim.x > 1) {
		// Subdivide the work
		uint64_t chunk_sz = buf_len/blockDim.x;
		i = threadIdx.x * chunk_sz;
		to = threadIdx.x * chunk_sz + chunk_sz;
		// If buffer size doesn't evenly divide, make last thread get remainder
		if (threadIdx.x + 1 == blockDim.x) {
			to = buf_len;
		}
	}
	for (; i < to; i++)
		buf[i] = max((curand(&rng_state) & 0xff), 1);
}

// Fill buffer with sequentially increasing 32-bit words
// @param buf Pointer to buffer
// @param buf_len Number of 32-bit words to write (NOT bytes; the loop writes
//        buf_len uint32_t elements)
// @param start_num Value stored at buf[0]; subsequent words increment by one
// @note Single-threaded; currently unused by main()
__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
	uint64_t i;
	for (i = 0; i < buf_len; i++)
		buf[i] = start_num++;
}

__device__ uint64_t gpu_res;
// Count number of zeros in a buffer
// @param buf Pointer to buffer
// @param buf_len Length of buffer in bytes
// @return via gpu_res Number of zeros found
// @note Supports splitting the work across the threads of a single block
// @note Expects a 1-D launch of exactly one block (atomicAdd_block only
//       synchronizes within a block)
__global__ void count_zero(char* buf, uint64_t buf_len) {
	// Let exactly one thread zero the accumulator, and make every thread wait
	// for it: if all threads wrote 0 unsynchronized (as the original did), a
	// late "= 0" could wipe out another thread's atomicAdd.
	if (threadIdx.x == 0)
		gpu_res = 0;
	__syncthreads();
	uint64_t to = buf_len;
	uint64_t i = 0;
	if (blockDim.x > 1) {
		// Subdivide the work
		uint64_t chunk_sz = buf_len/blockDim.x;
		i = threadIdx.x * chunk_sz;
		to = threadIdx.x * chunk_sz + chunk_sz;
		// If buffer size doesn't evenly divide, make last thread get remainder
		if (threadIdx.x + 1 == blockDim.x) {
			to = buf_len;
		}
	}
	// Must start at zero; an uninitialized accumulator would add garbage
	uint64_t num_zero = 0;
	for (; i < to; i++)
		num_zero += (!buf[i]);
	// Cast shouldn't strictly be needed, but won't build without...
	atomicAdd_block((unsigned long long int*)&gpu_res,
	                (unsigned long long int)num_zero);
}

// Subtract first parameter from second parameter. Return as nanoseconds.
long time_diff_ns(struct timespec start, struct timespec stop) {
	return (s2ns(stop.tv_sec) + stop.tv_nsec)
	     - (s2ns(start.tv_sec) + start.tv_nsec);
}

int main(int argc, char **argv) {
	char* gpu_buf;
	struct timespec out_start, out_stop;
	int res;
	uint64_t num_zero; // 64-bit, matching gpu_res (an int would be overrun)
	cudaStream_t stream1;
	cudaError_t err;
	int iters;

	if (argc != 2 || argv[1][0] == '-') {
		fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
		return 1;
	}
	iters = atoi(argv[1]);
	// atoi() yields 0 on garbage; reject non-positive counts up front
	if (iters <= 0) {
		fprintf(stderr, "Error: iteration count must be a positive integer.\n");
		return 1;
	}

	SAFE(cudaStreamCreate(&stream1));
	SAFE(cudaMalloc(&gpu_buf, GiB));

	// Fill buffer with (guaranteed non-zero) data
	fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
	SAFE(cudaGetLastError()); // Catch launch-configuration errors
	SAFE(cudaStreamSynchronize(stream1));

	clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
	for (int i = 0; i < iters; i++) {
		// NOTE(review): fd 6 is assumed to be the nvgpu address-space fd and
		// 1160 the dmabuf handle of gpu_buf, both inherited from the launch
		// environment -- confirm against the experiment harness before reuse.
		struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
		// Copy out
		res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
		if (res) {
			perror("Error in NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER");
			return res;
		}
		// Copy in
		res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
		if (res) {
			perror("Error in NVGPU_AS_IOCTL_READ_SWAP_BUFFER");
			return res;
		}
	}
	clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);

	// Check for valid contents. Synchronize before reading the result symbol
	// so the copy cannot observe a partially-updated gpu_res.
	count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
	SAFE(cudaGetLastError());
	SAFE(cudaStreamSynchronize(stream1));
	SAFE(cudaMemcpyFromSymbol(&num_zero, gpu_res, sizeof(num_zero), 0,
	                          cudaMemcpyDeviceToHost));
	if (num_zero > 0) {
		fprintf(stderr, "Error: Found %llu zeros in supposedly non-zero buffer"
		        " after paging!\n", (unsigned long long)num_zero);
		return 1;
	}

	long duration = ns2us(time_diff_ns(out_start, out_stop));
	printf("Took %ldus to do %d paging loops (%.2f us per loop)\n", duration,
	       iters, duration/(float)iters);
	cudaFree(gpu_buf);
	cudaStreamDestroy(stream1);
	return 0;
}