/**
* Copyright 2022 Joshua Bakita
*/
#include <stdio.h>
#include <cuda.h>
#include <curand_kernel.h> // curandState_t and curand
#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
#include <errno.h>
#include <time.h> // clock_gettime
#include <sys/ioctl.h> // ioctl
#include <unistd.h> // sleep
#define s2ns(s) ((s)*1000l*1000l*1000l)
#define ns2us(ns) ((ns)/1000l)
#define GiB 1024l*1024l*1024l
// Originally from copy_testbed.h in the copy_experiments set
#define SAFE(x) \
if ((err = (cudaError_t)(x)) != 0) { \
printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
printf("Suspect line: %s\n", #x); \
exit(1); \
}
// Fill buffer with random bytes. Supports buffers >4GiB.
// Original function on CPU part of copy_only.cu
// @param buf Pointer to buffer
// @param buf_len Length of buffer in bytes
// @note Supports splitting the work across threads
__device__ curandState_t rng_state;
__global__ void fill_rand(char* buf, uint64_t buf_len) {
uint64_t to = buf_len;
uint64_t i = 0;
if (blockDim.x > 1) {
// Subdivide the work
uint64_t chunk_sz = buf_len/blockDim.x;
i = threadIdx.x * chunk_sz;
to = threadIdx.x * chunk_sz + chunk_sz;
// If buffer size doesn't evenly divide, make last thread get remaineder
if (threadIdx.x + 1 == blockDim.x) {
to = buf_len;
}
}
for (; i < to; i++)
buf[i] = max((curand(&rng_state) & 0xff), 1);
}
// Fill buffer with sequential quadwords
// @param buf Pointer to buffer
// @param buf_len Length of buffer in bytes
__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
uint64_t i;
for (i = 0; i < buf_len; i++)
buf[i] = start_num++;
}
__device__ uint64_t gpu_res;
// Count number of zeros in a buffer
// @param buf Pointer to buffer
// @param buf_len Length of buffer in bytes
// @return via gpu_res Number of zeros found
// @note Supports splitting the work across threads
__global__ void count_zero(char* buf, uint64_t buf_len) {
gpu_res = 0;
uint64_t to = buf_len;
uint64_t i = 0;
if (blockDim.x > 1) {
// Subdivide the work
uint64_t chunk_sz = buf_len/blockDim.x;
i = threadIdx.x * chunk_sz;
to = threadIdx.x * chunk_sz + chunk_sz;
// If buffer size doesn't evenly divide, make last thread get remaineder
if (threadIdx.x + 1 == blockDim.x) {
to = buf_len;
}
}
uint64_t num_zero;
for (; i < to; i++)
num_zero += (!buf[i]);
// Cast shouldn't strictly be needed, but won't build without...
atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
}
// Subtract first parameter from second parameter. Return as nanoseconds.
long time_diff_ns(struct timespec start, struct timespec stop) {
return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
}
int main(int argc, char **argv) {
char* gpu_buf;
struct timespec out_start, out_stop, in_start, in_stop;
int res;
cudaStream_t stream1;
cudaError_t err;
SAFE(cudaStreamCreate(&stream1));
SAFE(cudaMalloc(&gpu_buf, GiB));
// Fill buffer with data
fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
SAFE(cudaStreamSynchronize(stream1));
// Reset sector assignments (does not fail)
ioctl(6, NVGPU_AS_IOCTL_SWAP_RESET);
// Copy out
struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
if (res) {
perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
return res;
}
sleep(1); // Supposedly some other work would happen here
// Copy in
clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
if (res) {
perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
return res;
}
// Check for valid contents
count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost));
SAFE(cudaStreamSynchronize(stream1));
if (res > 0) {
fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res);
return 1;
}
// Print results as tab-seperated-values
printf("out (us)\tin (us)\n");
printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
ns2us(time_diff_ns(in_start, in_stop)));
cudaFree(gpu_buf);
}