Initial commit of tools as used for submission.

author: Joshua Bakita <jbakita@cs.unc.edu> 2022-09-12 10:47:56 -0400
committer: Joshua Bakita <jbakita@cs.unc.edu> 2022-09-12 10:47:56 -0400
commit: a6286e09f4a3c78522a12b3d55b53ef1245bf558 (patch)
tree: 1f20908883b3c4989d51bc66b655bfe258cba15d
6 files changed, 717 insertions, 0 deletions
diff --git a/bomb.c b/bomb.c
new file mode 100644
index 0000000..791e4f9
--- /dev/null
+++ b/bomb.c
@@ -0,0 +1,9 @@
+#include <stdint.h>
+#include <stdlib.h>
+#define SZ 1024ull*1024ull*1024ull*16ull
+// Allocates SZ bytes and writes one byte every 4096 bytes across the whole
+// allocation. Returns 1 if the allocation fails, 0 otherwise.
+int main() {
+        char* ptr = malloc(SZ);
+        // Without this check a failed allocation becomes a NULL dereference
+        if (ptr == NULL)
+                return 1;
+        for (uint64_t i = 0; i < SZ; i+=4096)
+                ptr[i] = i;
+        return 0;
+}
diff --git a/directio_paging_speed.c b/directio_paging_speed.c
new file mode 100644
index 0000000..b0a01d3
--- /dev/null
+++ b/directio_paging_speed.c
@@ -0,0 +1,129 @@
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+#define GiB 1024l*1024l*1024l
+#define s2ns(s) ((s)*1000l*1000l*1000l)
+#define ns2us(ns) ((ns)/1000l)
+#define PAGED_FILE "/dev/nvme0n1"
+// Returns the larger of the two integers.
+int max(int x, int y) {
+        if (y >= x)
+                return y;
+        return x;
+}
+// Original function from copy_only.cu
+// Fills buf[0..buf_len) with the low byte of successive rand() results,
+// storing 1 in place of any 0 so that no byte written is zero.
+void fill_rand(char* buf, uint64_t buf_len) {
+        char* end = buf + buf_len;
+        for (char* cur = buf; cur != end; cur++)
+                *cur = max((rand() & 0xff), 1);
+}
+// Returns how many of the bytes in buf[0..buf_len) are zero.
+uint64_t count_zero(char* buf, uint64_t buf_len) {
+        uint64_t zeros = 0;
+        for (uint64_t pos = 0; pos != buf_len; pos++) {
+                if (buf[pos] == 0)
+                        zeros++;
+        }
+        return zeros;
+}
+// Returns the time elapsed from start to stop, in nanoseconds.
+long time_diff_ns(struct timespec start, struct timespec stop) {
+        long start_ns = s2ns(start.tv_sec) + start.tv_nsec;
+        long stop_ns = s2ns(stop.tv_sec) + stop.tv_nsec;
+        return stop_ns - start_ns;
+}
+// Benchmarks direct I/O "paging": each iteration writes a 1 GiB buffer of
+// random non-zero bytes to PAGED_FILE, then reads 1 GiB back from offset 0 and
+// checks that it contains no zero bytes.
+// Usage: <prog> [iterations]   (one iteration when omitted)
+// Prints a header and one tab-separated row of microsecond timings per
+// iteration. Returns 0 on success, 1 on an open/allocation/I/O/validation
+// error and 2 on a short write, a short read or a misplaced seek.
+int main(int argc, char **argv) {
+        struct timespec out_start, out_stop, in_start, in_stop;
+        int iters = 1;
+        int res;
+        if (argc > 1)
+                iters = atoi(argv[1]);
+        // Needed to allow page cache clearing between iterations
+        // Note: Shouldn't be needed with O_DIRECT, but include it just in case
+        int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+        if (clear_fd == -1) {
+                perror("Unable to open /proc/sys/vm/drop_caches");
+                return 1;
+        }
+        char clear_cmd = '3';
+        printf("out (us)\tin (us)\n");
+        for (int i = 0; i < iters; i++) {
+                char *mem_in, *mem_out;
+                int fd = open(PAGED_FILE, O_RDWR | O_DIRECT | O_SYNC);
+                if (fd == -1) {
+                        perror("Unable to open " PAGED_FILE);
+                        return 1;
+                }
+                // Clear page cache
+                write(clear_fd, &clear_cmd, 1);
+                // Allocate and fill a buffer with random data
+                // Aligned malloc(GiB) basically
+                res = posix_memalign((void**)&mem_in, 4096, GiB);
+                // posix_memalign() reports failure through its return value;
+                // check it before fill_rand() writes through mem_in
+                if (res) {
+                        fprintf(stderr, "posix_memalign() failure. Error %d.", res);
+                        return 1;
+                }
+                fill_rand(mem_in, GiB);
+                // Write and free buffer (the free is inside the timed region)
+                clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
+                res = write(fd, mem_in, GiB);
+                free(mem_in);
+                clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
+                if (res == -1) {
+                        perror("Unable to write 1GiB to " PAGED_FILE);
+                        return 1;
+                }
+                if (res != GiB) {
+                        fprintf(stderr, "Unable to write the buffer all at once!");
+                        return 2;
+                }
+                sleep(1); // Supposedly some other work would happen here
+                write(clear_fd, &clear_cmd, 1); // Just in case O_DIRECT misbehaves
+                res = lseek(fd, 0, SEEK_SET); // Reposition offset
+                if (res == -1) {
+                        perror("Unable to seek to offset 0 in " PAGED_FILE);
+                        return 1;
+                }
+                if (res != 0) {
+                        fprintf(stderr, "Unable to seek to offset 0 in " PAGED_FILE);
+                        return 2;
+                }
+                // Allocate and read buffer (the allocation is inside the timed region)
+                clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
+                // Aligned malloc(GiB) basically
+                res = posix_memalign((void**)&mem_out, 4096, GiB);
+                if (res) {
+                        fprintf(stderr, "posix_memalign() failure. Error %d.", res);
+                        return 1;
+                }
+                res = read(fd, mem_out, GiB);
+                clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
+                if (res == -1) {
+                        perror("Unable to read 1GiB from " PAGED_FILE);
+                        return 1;
+                }
+                if (res < GiB) {
+                        fprintf(stderr, "Unable to read the buffer all at once!");
+                        return 2;
+                }
+                // Check for valid contents
+                // TODO: Use CRC32 or something else a bit less dumb
+                res = count_zero(mem_out, GiB);
+                if (res > 0) {
+                        fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after I/O!\n", res);
+                        return 1;
+                }
+                // Print results as tab-separated values
+                printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
+                                     ns2us(time_diff_ns(in_start, in_stop)));
+                close(fd);
+                free(mem_out);
+        }
+        close(clear_fd);
+        return 0;
+}
diff --git a/gpu_paging_evil_task.cu b/gpu_paging_evil_task.cu
new file mode 100644
index 0000000..7c1ab59
--- /dev/null
+++ b/gpu_paging_evil_task.cu
@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <cuda.h>
+#include <curand_kernel.h> // curandState_t and curand
+#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
+#include <errno.h>
+#include <time.h> // clock_gettime
+#include <sys/ioctl.h> // ioctl
+#include <unistd.h> // sleep
+#define s2ns(s) ((s)*1000l*1000l*1000l)
+#define ns2us(ns) ((ns)/1000l)
+#define GiB 1024l*1024l*1024l
+// Originally from copy_testbed.h in the copy_experiments set
+// Evaluates the CUDA call x; on any non-zero status prints the error code,
+// its description and the offending expression, then exits with status 1.
+// Requires a cudaError_t variable named err in the calling scope.
+// Wrapped in do/while(0) so that SAFE(...); is a single statement and cannot
+// capture a following else.
+#define SAFE(x) \
+        do { \
+                if ((err = (cudaError_t)(x)) != 0) { \
+                        printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
+                        printf("Suspect line: %s\n", #x); \
+                        exit(1); \
+                } \
+        } while (0)
+// Fill buffer with random bytes. Supports buffers >4GiB.
+// Original function on CPU part of copy_only.cu
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @note Supports splitting the work across the threads of one block (only
+//       threadIdx.x and blockDim.x are used to pick a thread's chunk)
+// @note Every byte written is non-zero: a draw of 0 is replaced with 1
+__global__ void fill_rand(char* buf, uint64_t buf_len) {
+        // Each thread seeds and advances its own generator state. One shared
+        // state that is never passed to curand_init() would be unseeded and
+        // would be modified by every thread at once (a data race).
+        curandState_t rng_state;
+        curand_init(1234ULL, threadIdx.x, 0, &rng_state);
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        for (; i < to; i++)
+                buf[i] = max((curand(&rng_state) & 0xff), 1);
+}
+// Fill buffer with sequential 32-bit values
+// @param buf       Pointer to buffer
+// @param buf_len   Number of 32-bit elements to write (the loop indexes
+//                  elements of buf, not bytes)
+// @param start_num Value stored in buf[0]; each later element is one higher,
+//                  truncated to 32 bits on store
+// @note Work is not subdivided: every launched thread writes the whole range
+__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
+        for (uint64_t idx = 0; idx != buf_len; idx++)
+                buf[idx] = start_num + idx;
+}
+__device__ uint64_t gpu_res;
+// Count number of zeros in a buffer
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @return via gpu_res Number of zeros found
+// @note Supports splitting the work across the threads of a single block; the
+//       reset of gpu_res and the block-scoped atomic add are not valid across
+//       multiple blocks
+__global__ void count_zero(char* buf, uint64_t buf_len) {
+        // Reset the result exactly once and wait for the reset before any
+        // thread adds to it. If every thread stored 0, a late store could
+        // erase totals that faster threads had already added.
+        if (threadIdx.x == 0)
+                gpu_res = 0;
+        __syncthreads();
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        // Per-thread running total; must start at zero
+        uint64_t num_zero = 0;
+        for (; i < to; i++)
+                num_zero += (!buf[i]);
+        // Cast shouldn't strictly be needed, but won't build without...
+        atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
+}
+// Returns the time elapsed from start to stop, in nanoseconds.
+long time_diff_ns(struct timespec start, struct timespec stop) {
+        long start_ns = s2ns(start.tv_sec) + start.tv_nsec;
+        long stop_ns = s2ns(stop.tv_sec) + stop.tv_nsec;
+        return stop_ns - start_ns;
+}
+// Fills a 1 GiB GPU buffer with non-zero bytes, then repeatedly issues the
+// nvgpu write-swap and read-swap ioctls and times the whole loop. Afterwards a
+// GPU kernel checks that the buffer still contains no zero bytes.
+// Usage: <prog> <iterations>   (iterations must be at least 1)
+// NOTE(review): file descriptor 6 and the value 1160 are hard-coded; their
+// meaning is defined by the environment and nvgpu.h - confirm before reuse.
+int main(int argc, char **argv) {
+        char* gpu_buf;
+        struct timespec out_start, out_stop;
+        int res;
+        // Receives gpu_res; must be as wide as the number of bytes copied back
+        // from the device, otherwise the copy overruns the destination
+        unsigned long long num_zeros = 0;
+        cudaStream_t stream1;
+        cudaError_t err;
+        int iters;
+        if (argc != 2) {
+                fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
+                return 1;
+        }
+        iters = atoi(argv[1]);
+        // The per-loop average below divides by iters
+        if (iters < 1) {
+                fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
+                return 1;
+        }
+        SAFE(cudaStreamCreate(&stream1));
+        SAFE(cudaMalloc(&gpu_buf, GiB));
+        // Fill buffer with data
+        fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
+        for (int i = 0; i < iters; i++) {
+                // Copy out
+                struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
+                res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
+                if (res) {
+                        perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
+                        return res;
+                }
+                // Copy in
+                res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
+                if (res) {
+                        perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
+                        return res;
+                }
+        }
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
+        // Check for valid contents; wait for the kernel before reading its result
+        count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        SAFE(cudaMemcpyFromSymbol(&num_zeros, gpu_res, sizeof(num_zeros), 0, cudaMemcpyDeviceToHost));
+        if (num_zeros > 0) {
+                fprintf(stderr, "Error: Found %llu zeros in supposedly non-zero buffer after paging!\n", num_zeros);
+                return 1;
+        }
+        long duration = ns2us(time_diff_ns(out_start, out_stop));
+        printf("Took %ldus to do %d paging loops (%.2f us per loop)\n", duration, iters, duration/(float)iters);
+        cudaFree(gpu_buf);
+}
diff --git a/gpu_paging_overhead_speed.cu b/gpu_paging_overhead_speed.cu
new file mode 100644
index 0000000..c7a0f3a
--- /dev/null
+++ b/gpu_paging_overhead_speed.cu
@@ -0,0 +1,168 @@
+#include <stdio.h>
+#include <cuda.h>
+#include <curand_kernel.h> // curandState_t and curand
+#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
+#include <errno.h>
+#include <time.h> // clock_gettime
+#include <sys/ioctl.h> // ioctl
+#include <unistd.h> // sleep
+#define s2ns(s) ((s)*1000l*1000l*1000l)
+#define ns2us(ns) ((ns)/1000l)
+#define GiB 1024l*1024l*1024l
+// Originally from copy_testbed.h in the copy_experiments set
+// Evaluates the CUDA call x; on any non-zero status prints the error code,
+// its description and the offending expression, then exits with status 1.
+// Requires a cudaError_t variable named err in the calling scope.
+// Wrapped in do/while(0) so that SAFE(...); is a single statement and cannot
+// capture a following else.
+#define SAFE(x) \
+        do { \
+                if ((err = (cudaError_t)(x)) != 0) { \
+                        printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
+                        printf("Suspect line: %s\n", #x); \
+                        exit(1); \
+                } \
+        } while (0)
+// Fill buffer with random bytes. Supports buffers >4GiB.
+// Original function on CPU part of copy_only.cu
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @note Supports splitting the work across the threads of one block (only
+//       threadIdx.x and blockDim.x are used to pick a thread's chunk)
+// @note Every byte written is non-zero: a draw of 0 is replaced with 1
+__global__ void fill_rand(char* buf, uint64_t buf_len) {
+        // Each thread seeds and advances its own generator state. One shared
+        // state that is never passed to curand_init() would be unseeded and
+        // would be modified by every thread at once (a data race).
+        curandState_t rng_state;
+        curand_init(1234ULL, threadIdx.x, 0, &rng_state);
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        for (; i < to; i++)
+                buf[i] = max((curand(&rng_state) & 0xff), 1);
+}
+// Fill buffer with sequential 32-bit values
+// @param buf       Pointer to buffer
+// @param buf_len   Number of 32-bit elements to write (the loop indexes
+//                  elements of buf, not bytes)
+// @param start_num Value stored in buf[0]; each later element is one higher,
+//                  truncated to 32 bits on store
+// @note Work is not subdivided: every launched thread writes the whole range
+__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
+        for (uint64_t idx = 0; idx != buf_len; idx++)
+                buf[idx] = start_num + idx;
+}
+__device__ uint64_t gpu_res;
+// Count number of zeros in a buffer
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @return via gpu_res Number of zeros found
+// @note Supports splitting the work across the threads of a single block; the
+//       reset of gpu_res and the block-scoped atomic add are not valid across
+//       multiple blocks
+__global__ void count_zero(char* buf, uint64_t buf_len) {
+        // Reset the result exactly once and wait for the reset before any
+        // thread adds to it. If every thread stored 0, a late store could
+        // erase totals that faster threads had already added.
+        if (threadIdx.x == 0)
+                gpu_res = 0;
+        __syncthreads();
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        // Per-thread running total; must start at zero
+        uint64_t num_zero = 0;
+        for (; i < to; i++)
+                num_zero += (!buf[i]);
+        // Cast shouldn't strictly be needed, but won't build without...
+        atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
+}
+// Returns the time elapsed from start to stop, in nanoseconds.
+long time_diff_ns(struct timespec start, struct timespec stop) {
+        long start_ns = s2ns(start.tv_sec) + start.tv_nsec;
+        long stop_ns = s2ns(stop.tv_sec) + stop.tv_nsec;
+        return stop_ns - start_ns;
+}
+#define AS_FD 6
+// Use 8 if running with perf
+//#define AS_FD 8
+// Times each of the four asynchronous nvgpu swap ioctls separately (start and
+// finish of the write-out, start and finish of the read-in) for a 1 GiB GPU
+// buffer, with a one-second sleep between calls, then has a GPU kernel check
+// that the buffer still contains no zero bytes.
+// Prints one header row and one row of microsecond timings, tab-separated.
+// NOTE(review): the value 1160 is hard-coded; its meaning is defined by
+// nvgpu.h - confirm before reuse.
+int main(int argc, char **argv) {
+        char* gpu_buf;
+        struct timespec out_start, out_stop, in_start, in_stop;
+        struct timespec out_start2, out_stop2, in_start2, in_stop2;
+        int res;
+        // Receives gpu_res; must be as wide as the number of bytes copied back
+        // from the device, otherwise the copy overruns the destination
+        unsigned long long num_zeros = 0;
+        cudaStream_t stream1;
+        cudaError_t err;
+        SAFE(cudaStreamCreate(&stream1));
+        SAFE(cudaMalloc(&gpu_buf, GiB));
+        // Fill buffer with data
+        fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        // Reset sector assignments (does not fail)
+        ioctl(AS_FD, NVGPU_AS_IOCTL_SWAP_RESET);
+        // Copy out
+        struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
+        //struct nvgpu_as_swap_buffer_args ioctl_arg = {NVGPU_SWAP_ALL};
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
+        res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
+        if (res < 0) {
+                perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC");
+                return res;
+        }
+        sleep(1);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_start2);
+        res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop2);
+        if (res < 0) {
+                perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH");
+                return res;
+        }
+        sleep(1); // Supposedly some other work would happen here
+        // Copy in
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
+        res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
+        if (res < 0) {
+                perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC");
+                return res;
+        }
+        sleep(1);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_start2);
+        res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop2);
+        if (res < 0) {
+                perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH");
+                return res;
+        }
+        // Check for valid contents; wait for the kernel before reading its result
+        count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        SAFE(cudaMemcpyFromSymbol(&num_zeros, gpu_res, sizeof(num_zeros), 0, cudaMemcpyDeviceToHost));
+        if (num_zeros > 0) {
+                fprintf(stderr, "Error: Found %llu zeros in supposedly non-zero buffer after paging!\n", num_zeros);
+                return 1;
+        }
+        // Print results as tab-separated values
+        // The fourth column is the read-in finish time, so label it in_fin
+        printf("out_start(us)\tout_fin(us)\tin_start(us)\tin_fin(us)\n");
+        printf("%ld\t%ld\t%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
+                             ns2us(time_diff_ns(out_start2, out_stop2)),
+                             ns2us(time_diff_ns(in_start, in_stop)),
+                             ns2us(time_diff_ns(in_start2, in_stop2)));
+        cudaFree(gpu_buf);
+}
diff --git a/gpu_paging_speed.cu b/gpu_paging_speed.cu
new file mode 100644
index 0000000..72cb82e
--- /dev/null
+++ b/gpu_paging_speed.cu
@@ -0,0 +1,136 @@
+#include <stdio.h>
+#include <cuda.h>
+#include <curand_kernel.h> // curandState_t and curand
+#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
+#include <errno.h>
+#include <time.h> // clock_gettime
+#include <sys/ioctl.h> // ioctl
+#include <unistd.h> // sleep
+#define s2ns(s) ((s)*1000l*1000l*1000l)
+#define ns2us(ns) ((ns)/1000l)
+#define GiB 1024l*1024l*1024l
+// Originally from copy_testbed.h in the copy_experiments set
+// Evaluates the CUDA call x; on any non-zero status prints the error code,
+// its description and the offending expression, then exits with status 1.
+// Requires a cudaError_t variable named err in the calling scope.
+// Wrapped in do/while(0) so that SAFE(...); is a single statement and cannot
+// capture a following else.
+#define SAFE(x) \
+        do { \
+                if ((err = (cudaError_t)(x)) != 0) { \
+                        printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
+                        printf("Suspect line: %s\n", #x); \
+                        exit(1); \
+                } \
+        } while (0)
+// Fill buffer with random bytes. Supports buffers >4GiB.
+// Original function on CPU part of copy_only.cu
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @note Supports splitting the work across the threads of one block (only
+//       threadIdx.x and blockDim.x are used to pick a thread's chunk)
+// @note Every byte written is non-zero: a draw of 0 is replaced with 1
+__global__ void fill_rand(char* buf, uint64_t buf_len) {
+        // Each thread seeds and advances its own generator state. One shared
+        // state that is never passed to curand_init() would be unseeded and
+        // would be modified by every thread at once (a data race).
+        curandState_t rng_state;
+        curand_init(1234ULL, threadIdx.x, 0, &rng_state);
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        for (; i < to; i++)
+                buf[i] = max((curand(&rng_state) & 0xff), 1);
+}
+// Fill buffer with sequential 32-bit values
+// @param buf       Pointer to buffer
+// @param buf_len   Number of 32-bit elements to write (the loop indexes
+//                  elements of buf, not bytes)
+// @param start_num Value stored in buf[0]; each later element is one higher,
+//                  truncated to 32 bits on store
+// @note Work is not subdivided: every launched thread writes the whole range
+__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
+        for (uint64_t idx = 0; idx != buf_len; idx++)
+                buf[idx] = start_num + idx;
+}
+__device__ uint64_t gpu_res;
+// Count number of zeros in a buffer
+// @param buf     Pointer to buffer
+// @param buf_len Length of buffer in bytes
+// @return via gpu_res Number of zeros found
+// @note Supports splitting the work across the threads of a single block; the
+//       reset of gpu_res and the block-scoped atomic add are not valid across
+//       multiple blocks
+__global__ void count_zero(char* buf, uint64_t buf_len) {
+        // Reset the result exactly once and wait for the reset before any
+        // thread adds to it. If every thread stored 0, a late store could
+        // erase totals that faster threads had already added.
+        if (threadIdx.x == 0)
+                gpu_res = 0;
+        __syncthreads();
+        uint64_t to = buf_len;
+        uint64_t i = 0;
+        if (blockDim.x > 1) {
+                // Subdivide the work
+                uint64_t chunk_sz = buf_len/blockDim.x;
+                i = threadIdx.x * chunk_sz;
+                to = threadIdx.x * chunk_sz + chunk_sz;
+                // If buffer size doesn't evenly divide, make last thread get remainder
+                if (threadIdx.x + 1 == blockDim.x) {
+                        to = buf_len;
+                }
+        }
+        // Per-thread running total; must start at zero
+        uint64_t num_zero = 0;
+        for (; i < to; i++)
+                num_zero += (!buf[i]);
+        // Cast shouldn't strictly be needed, but won't build without...
+        atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
+}
+// Returns the time elapsed from start to stop, in nanoseconds.
+long time_diff_ns(struct timespec start, struct timespec stop) {
+        long start_ns = s2ns(start.tv_sec) + start.tv_nsec;
+        long stop_ns = s2ns(stop.tv_sec) + stop.tv_nsec;
+        return stop_ns - start_ns;
+}
+// Times one synchronous nvgpu write-swap and one read-swap of a 1 GiB GPU
+// buffer (with a one-second sleep in between), then has a GPU kernel check
+// that the buffer still contains no zero bytes.
+// Prints one header row and one row of microsecond timings, tab-separated.
+// NOTE(review): file descriptor 6 and the value 1160 are hard-coded; their
+// meaning is defined by the environment and nvgpu.h - confirm before reuse.
+int main(int argc, char **argv) {
+        char* gpu_buf;
+        struct timespec out_start, out_stop, in_start, in_stop;
+        int res;
+        // Receives gpu_res; must be as wide as the number of bytes copied back
+        // from the device, otherwise the copy overruns the destination
+        unsigned long long num_zeros = 0;
+        cudaStream_t stream1;
+        cudaError_t err;
+        SAFE(cudaStreamCreate(&stream1));
+        SAFE(cudaMalloc(&gpu_buf, GiB));
+        // Fill buffer with data
+        fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        // Reset sector assignments (does not fail)
+        ioctl(6, NVGPU_AS_IOCTL_SWAP_RESET);
+        // Copy out
+        struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
+        res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
+        if (res) {
+                perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
+                return res;
+        }
+        sleep(1); // Supposedly some other work would happen here
+        // Copy in
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
+        res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
+        clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
+        if (res) {
+                perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
+                return res;
+        }
+        // Check for valid contents; wait for the kernel before reading its result
+        count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
+        SAFE(cudaStreamSynchronize(stream1));
+        SAFE(cudaMemcpyFromSymbol(&num_zeros, gpu_res, sizeof(num_zeros), 0, cudaMemcpyDeviceToHost));
+        if (num_zeros > 0) {
+                fprintf(stderr, "Error: Found %llu zeros in supposedly non-zero buffer after paging!\n", num_zeros);
+                return 1;
+        }
+        // Print results as tab-separated values
+        printf("out (us)\tin (us)\n");
+        printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
+                             ns2us(time_diff_ns(in_start, in_stop)));
+        cudaFree(gpu_buf);
+}
diff --git a/paging_speed.c b/paging_speed.c
new file mode 100644
index 0000000..4ad56e2
--- /dev/null
+++ b/paging_speed.c
@@ -0,0 +1,137 @@
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h> // strlen()
+#define GiB 1024l*1024l*1024l
+#define s2ns(s) ((s)*1000l*1000l*1000l)
+// Reads one byte from every 4 KiB page of mem and counts how many of those
+// sampled bytes equal to_find.
+// @param mem     Buffer to walk
+// @param len     Length of the buffer in bytes
+// @param to_find Byte value to count
+// @return Number of sampled bytes (offsets 0, 4096, 8192, ...) equal to to_find
+int seq_walk(char* mem, int len, char to_find) {
+        int num_found = 0;
+        // Stride of 4096 bytes (one 4k page). Start at offset 0 so the first
+        // page is touched and counted as well; callers compare the result
+        // against len/4096 pages.
+        for (int i = 0; i < len; i += 4096)
+                if (mem[i] == to_find)
+                        num_found++;
+        return num_found;
+}
+// Returns the time elapsed from start to stop, in nanoseconds.
+long time_diff_ns(struct timespec start, struct timespec stop) {
+        long start_ns = s2ns(start.tv_sec) + start.tv_nsec;
+        long stop_ns = s2ns(stop.tv_sec) + stop.tv_nsec;
+        return stop_ns - start_ns;
+}
+// TODO: take *num_42, return time
+//#define PAGED_FILE "/home/jbakita/1gib_random_f"
+#define PAGED_FILE "/dev/nvme0n1"
+// Times bringing 1 GiB of PAGED_FILE into memory two ways: first via mmap()
+// plus a page-stride walk, then via a single O_DIRECT read() into an aligned
+// buffer. Each method runs for the requested number of iterations.
+// Usage: <prog> [iterations] [--no-seq]
+// With --no-seq, the mmap path times a second walk and subtracts it from the
+// result, and the read path skips its walk entirely.
+// With one iteration the timings are printed as sentences; otherwise each
+// method prints one comma-separated line of microsecond values.
+// Returns 0 on success, 1 on an open/mmap/allocation/read error and 2 on a
+// short read.
+int main(int argc, char **argv) {
+        int iters = 1;
+        int no_seq = 0;
+        if (argc > 1)
+                iters = atoi(argv[1]);
+        if (argc > 2) {
+                // strcmp() returns 0 on a match, so the flag is set only when the
+                // argument is exactly "--no-seq"
+                no_seq = strcmp(argv[2], "--no-seq") == 0;
+                if (no_seq)
+                        fprintf(stderr, "Skipping seq, but using no-seq emulation with demand paging\n");
+        }
+        struct timespec start, stop, seq_stop;
+        int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+        if (clear_fd == -1) {
+                perror("Unable to open /proc/sys/vm/drop_caches");
+                return 1;
+        }
+        char clear_cmd = '3';
+        for (int i = 0; i < iters; i++) {
+                int fd = open(PAGED_FILE, O_RDWR);
+                if (fd == -1) {
+                        perror("Unable to open " PAGED_FILE);
+                        return 1;
+                }
+                // Clear page cache
+                write(clear_fd, &clear_cmd, 1);
+                // VIA MMAP
+                clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+                char* mem = mmap(NULL, GiB, PROT_READ, MAP_PRIVATE, fd, 0);
+                if (mem == MAP_FAILED) {
+                        perror("Unable to mmap " PAGED_FILE);
+                        return 1;
+                }
+                // Fault on all the pages via a sequential walk
+                int num_42 = seq_walk(mem, GiB, 42);
+                clock_gettime(CLOCK_MONOTONIC_RAW, &stop);
+                // Second walk over the same mapping, timed separately so its
+                // cost can be subtracted below
+                int num_52 = 0;
+                if (no_seq)
+                        num_52 = seq_walk(mem, GiB, 52);
+                clock_gettime(CLOCK_MONOTONIC_RAW, &seq_stop);
+                if (num_52)
+                        fprintf(stderr, "Something is seriously wrong! Found a 52 in a buffer that should be only 42s\n");
+                long duration = time_diff_ns(start, stop);
+                // Emulate the time demand paging would take if we didn't have to walk
+                if (no_seq) {
+                        long seq_time = time_diff_ns(stop, seq_stop);
+                        duration -= seq_time;
+                }
+                if (iters == 1) {
+                        printf("Took %ldus via mmap\n", duration / 1000);
+                        printf("Read %d 42s of %ld expected\n", num_42, GiB/4096);
+                } else {
+                        printf("%ld, ", duration / 1000);
+                }
+                munmap(mem, GiB);
+                close(fd);
+        }
+        if (iters > 1)
+                printf("\n");
+        for (int i = 0; i < iters; i++) {
+                char* mem;
+                int fd = open(PAGED_FILE, O_RDWR | O_DIRECT);
+                if (fd == -1) {
+                        perror("Unable to open " PAGED_FILE);
+                        return 1;
+                }
+                // Clear page cache
+                write(clear_fd, &clear_cmd, 1);
+                // VIA READ (the allocation is inside the timed region)
+                clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+                // Aligned malloc(GiB) basically
+                int res = posix_memalign((void**)&mem, 4096, GiB);
+                if (res) {
+                        fprintf(stderr, "posix_memalign() failure. Error %d.", res);
+                        return 1;
+                }
+                res = read(fd, mem, GiB);
+                if (res == -1) {
+                        perror("Unable to read 1GiB from /dev/nvme0n1");
+                        return 1;
+                }
+                if (res < GiB) {
+                        fprintf(stderr, "Unable to read the buffer all at once!");
+                        return 2;
+                }
+                int num_42 = 0;
+                if (!no_seq)
+                        num_42 = seq_walk(mem, GiB, 42); // Not strictly necessary, but to match mmap path overheads
+                clock_gettime(CLOCK_MONOTONIC_RAW, &stop);
+                long read_duration = time_diff_ns(start, stop);
+                if (iters == 1) {
+                        printf("Took %ldus via read\n", read_duration / 1000);
+                        if (!no_seq)
+                                printf("Read %d 42s of %ld expected\n", num_42, GiB/4096);
+                } else {
+                        printf("%ld, ", read_duration / 1000);
+                }
+                close(fd);
+                free(mem);
+        }
+        if (iters > 1)
+                printf("\n");
+        close(clear_fd);
+        return 0;
+}
author	Joshua Bakita <jbakita@cs.unc.edu>	2022-09-12 10:47:56 -0400
committer	Joshua Bakita <jbakita@cs.unc.edu>	2022-09-12 10:47:56 -0400
commit	a6286e09f4a3c78522a12b3d55b53ef1245bf558 (patch)
tree	1f20908883b3c4989d51bc66b655bfe258cba15d