summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2022-09-12 10:47:56 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2022-09-12 10:47:56 -0400
commita6286e09f4a3c78522a12b3d55b53ef1245bf558 (patch)
tree1f20908883b3c4989d51bc66b655bfe258cba15d
Initial commit of tools as used for submission.
-rw-r--r--bomb.c9
-rw-r--r--directio_paging_speed.c129
-rw-r--r--gpu_paging_evil_task.cu138
-rw-r--r--gpu_paging_overhead_speed.cu168
-rw-r--r--gpu_paging_speed.cu136
-rw-r--r--paging_speed.c137
6 files changed, 717 insertions, 0 deletions
diff --git a/bomb.c b/bomb.c
new file mode 100644
index 0000000..791e4f9
--- /dev/null
+++ b/bomb.c
@@ -0,0 +1,9 @@
1#include <stdint.h>
2#include <stdlib.h>
3
4#define SZ 1024ull*1024ull*1024ull*16ull
5int main() {
6 char* ptr = malloc(SZ);
7 for (uint64_t i = 0; i < SZ; i+=4096)
8 ptr[i] = i;
9}
diff --git a/directio_paging_speed.c b/directio_paging_speed.c
new file mode 100644
index 0000000..b0a01d3
--- /dev/null
+++ b/directio_paging_speed.c
@@ -0,0 +1,129 @@
1#define _GNU_SOURCE
2
3#include <sys/mman.h>
4#include <sys/types.h>
5#include <sys/stat.h>
6#include <fcntl.h>
7#include <stdio.h>
8#include <stdint.h>
9#include <time.h>
10#include <unistd.h>
11#include <stdlib.h>
12
13#define GiB 1024l*1024l*1024l
14#define s2ns(s) ((s)*1000l*1000l*1000l)
15#define ns2us(ns) ((ns)/1000l)
16#define PAGED_FILE "/dev/nvme0n1"
17int max(int x, int y) {return x > y ? x : y;}
18
19// Original function from copy_only.cu
20void fill_rand(char* buf, uint64_t buf_len) {
21 uint64_t i = 0;
22 for (; i < buf_len; i++)
23 buf[i] = max((rand() & 0xff), 1);
24}
25
26uint64_t count_zero(char* buf, uint64_t buf_len) {
27 uint64_t i = 0;
28 uint64_t num_zeros = 0;
29 for (; i < buf_len; i++)
30 num_zeros += (!buf[i]);
31 return num_zeros;
32}
33
34long time_diff_ns(struct timespec start, struct timespec stop) {
35 return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
36}
37
38int main(int argc, char **argv) {
39 struct timespec out_start, out_stop, in_start, in_stop;
40 int iters = 1;
41 int res;
42 if (argc > 1)
43 iters = atoi(argv[1]);
44
45 // Needed to allow page cache clearing between iterations
46 // Note: Shouldn't be needed with O_DIRECT, but include it just in case
47 int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
48 if (clear_fd == -1) {
49 perror("Unable to open /proc/sys/vm/drop_caches");
50 return 1;
51 }
52 char clear_cmd = '3';
53
54 printf("out (us)\tin (us)\n");
55 for (int i = 0; i < iters; i++) {
56 char *mem_in, *mem_out;
57 int fd = open(PAGED_FILE, O_RDWR | O_DIRECT | O_SYNC);
58 if (fd == -1) {
59 perror("Unable to open " PAGED_FILE);
60 return 1;
61 }
62 // Clear page cache
63 write(clear_fd, &clear_cmd, 1);
64 // Allocate and fill a buffer with random data
65 // Aligned malloc(GiB) basicially
66 res = posix_memalign((void**)&mem_in, 4096, GiB);
67 fill_rand(mem_in, GiB);
68
69 // Write and free buffer
70 clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
71 res = write(fd, mem_in, GiB);
72 free(mem_in);
73 clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
74 if (res == -1) {
75 perror("Unable to write 1GiB to " PAGED_FILE);
76 return 1;
77 }
78 if (res != GiB) {
79 fprintf(stderr, "Unable to write the buffer all at once!");
80 return 2;
81 }
82
83 sleep(1); // Supposedly some other work would happen here
84 write(clear_fd, &clear_cmd, 1); // Just in case O_DIRECT misbehaves
85 res = lseek(fd, 0, SEEK_SET); // Reposition offset
86 if (res == -1) {
87 perror("Unable to seek to offset 0 in " PAGED_FILE);
88 return 1;
89 }
90 if (res != 0) {
91 fprintf(stderr, "Unable to seek to offset 0 in " PAGED_FILE);
92 return 2;
93 }
94
95 // Allocate and read buffer
96 clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
97 // Aligned malloc(GiB) basicially
98 res = posix_memalign((void**)&mem_out, 4096, GiB);
99 if (res) {
100 fprintf(stderr, "posix_memalign() failure. Error %d.", res);
101 return 1;
102 }
103 res = read(fd, mem_out, GiB);
104 clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
105 if (res == -1) {
106 perror("Unable to read 1GiB from " PAGED_FILE);
107 return 1;
108 }
109 if (res < GiB) {
110 fprintf(stderr, "Unable to read the buffer all at once!");
111 return 2;
112 }
113
114 // Check for valid contents
115 // TODO: Use CRC32 or something else a bit less dumb
116 res = count_zero(mem_out, GiB);
117 if (res > 0) {
118 fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after I/O!\n", res);
119 return 1;
120 }
121
122 // Print results as tab-seperated-values
123 printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
124 ns2us(time_diff_ns(in_start, in_stop)));
125 close(fd);
126 free(mem_out);
127 }
128 return 0;
129}
diff --git a/gpu_paging_evil_task.cu b/gpu_paging_evil_task.cu
new file mode 100644
index 0000000..7c1ab59
--- /dev/null
+++ b/gpu_paging_evil_task.cu
@@ -0,0 +1,138 @@
1#include <stdio.h>
2#include <cuda.h>
3#include <curand_kernel.h> // curandState_t and curand
4#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
5#include <errno.h>
6#include <time.h> // clock_gettime
7#include <sys/ioctl.h> // ioctl
8#include <unistd.h> // sleep
9
10#define s2ns(s) ((s)*1000l*1000l*1000l)
11#define ns2us(ns) ((ns)/1000l)
12#define GiB 1024l*1024l*1024l
13
14// Originally from copy_testbed.h in the copy_experiments set
15#define SAFE(x) \
16 if ((err = (cudaError_t)(x)) != 0) { \
17 printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
18 printf("Suspect line: %s\n", #x); \
19 exit(1); \
20 }
21
22// Fill buffer with random bytes. Supports buffers >4GiB.
23// Original function on CPU part of copy_only.cu
24// @param buf Pointer to buffer
25// @param buf_len Length of buffer in bytes
26// @note Supports splitting the work across threads
27__device__ curandState_t rng_state;
28__global__ void fill_rand(char* buf, uint64_t buf_len) {
29 uint64_t to = buf_len;
30 uint64_t i = 0;
31 if (blockDim.x > 1) {
32 // Subdivide the work
33 uint64_t chunk_sz = buf_len/blockDim.x;
34 i = threadIdx.x * chunk_sz;
35 to = threadIdx.x * chunk_sz + chunk_sz;
36 // If buffer size doesn't evenly divide, make last thread get remaineder
37 if (threadIdx.x + 1 == blockDim.x) {
38 to = buf_len;
39 }
40 }
41 for (; i < to; i++)
42 buf[i] = max((curand(&rng_state) & 0xff), 1);
43}
44
45// Fill buffer with sequential quadwords
46// @param buf Pointer to buffer
47// @param buf_len Length of buffer in bytes
48__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
49 uint64_t i;
50 for (i = 0; i < buf_len; i++)
51 buf[i] = start_num++;
52}
53
54__device__ uint64_t gpu_res;
55// Count number of zeros in a buffer
56// @param buf Pointer to buffer
57// @param buf_len Length of buffer in bytes
58// @return via gpu_res Number of zeros found
59// @note Supports splitting the work across threads
60__global__ void count_zero(char* buf, uint64_t buf_len) {
61 gpu_res = 0;
62 uint64_t to = buf_len;
63 uint64_t i = 0;
64 if (blockDim.x > 1) {
65 // Subdivide the work
66 uint64_t chunk_sz = buf_len/blockDim.x;
67 i = threadIdx.x * chunk_sz;
68 to = threadIdx.x * chunk_sz + chunk_sz;
69 // If buffer size doesn't evenly divide, make last thread get remaineder
70 if (threadIdx.x + 1 == blockDim.x) {
71 to = buf_len;
72 }
73 }
74 uint64_t num_zero;
75 for (; i < to; i++)
76 num_zero += (!buf[i]);
77 // Cast shouldn't strictly be needed, but won't build without...
78 atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
79}
80
81long time_diff_ns(struct timespec start, struct timespec stop) {
82 return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
83}
84
85int main(int argc, char **argv) {
86 char* gpu_buf;
87 struct timespec out_start, out_stop;
88 int res;
89 cudaStream_t stream1;
90 cudaError_t err;
91 int iters;
92 if (argc != 2) {
93 fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
94 return 1;
95 }
96 iters = atoi(argv[1]);
97 SAFE(cudaStreamCreate(&stream1));
98 SAFE(cudaMalloc(&gpu_buf, GiB));
99
100 // Fill buffer with data
101 fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
102 SAFE(cudaStreamSynchronize(stream1));
103
104 clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
105
106 for (int i = 0; i < iters; i++) {
107 // Copy out
108 struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
109 res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
110 if (res) {
111 perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
112 return res;
113 }
114
115 // Copy in
116 res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
117 if (res) {
118 perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
119 return res;
120 }
121 }
122
123 clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
124
125 // Check for valid contents
126 count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
127 SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost));
128 SAFE(cudaStreamSynchronize(stream1));
129 if (res > 0) {
130 fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res);
131 return 1;
132 }
133
134 long duration = ns2us(time_diff_ns(out_start, out_stop));
135 printf("Took %ldus to do %d paging loops (%.2f us per loop)\n", duration, iters, duration/(float)iters);
136
137 cudaFree(gpu_buf);
138}
diff --git a/gpu_paging_overhead_speed.cu b/gpu_paging_overhead_speed.cu
new file mode 100644
index 0000000..c7a0f3a
--- /dev/null
+++ b/gpu_paging_overhead_speed.cu
@@ -0,0 +1,168 @@
1#include <stdio.h>
2#include <cuda.h>
3#include <curand_kernel.h> // curandState_t and curand
4#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
5#include <errno.h>
6#include <time.h> // clock_gettime
7#include <sys/ioctl.h> // ioctl
8#include <unistd.h> // sleep
9
10#define s2ns(s) ((s)*1000l*1000l*1000l)
11#define ns2us(ns) ((ns)/1000l)
12#define GiB 1024l*1024l*1024l
13
14// Originally from copy_testbed.h in the copy_experiments set
15#define SAFE(x) \
16 if ((err = (cudaError_t)(x)) != 0) { \
17 printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
18 printf("Suspect line: %s\n", #x); \
19 exit(1); \
20 }
21
22// Fill buffer with random bytes. Supports buffers >4GiB.
23// Original function on CPU part of copy_only.cu
24// @param buf Pointer to buffer
25// @param buf_len Length of buffer in bytes
26// @note Supports splitting the work across threads
27__device__ curandState_t rng_state;
28__global__ void fill_rand(char* buf, uint64_t buf_len) {
29 uint64_t to = buf_len;
30 uint64_t i = 0;
31 if (blockDim.x > 1) {
32 // Subdivide the work
33 uint64_t chunk_sz = buf_len/blockDim.x;
34 i = threadIdx.x * chunk_sz;
35 to = threadIdx.x * chunk_sz + chunk_sz;
36 // If buffer size doesn't evenly divide, make last thread get remaineder
37 if (threadIdx.x + 1 == blockDim.x) {
38 to = buf_len;
39 }
40 }
41 for (; i < to; i++)
42 buf[i] = max((curand(&rng_state) & 0xff), 1);
43}
44
45// Fill buffer with sequential quadwords
46// @param buf Pointer to buffer
47// @param buf_len Length of buffer in bytes
48__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
49 uint64_t i;
50 for (i = 0; i < buf_len; i++)
51 buf[i] = start_num++;
52}
53
54__device__ uint64_t gpu_res;
55// Count number of zeros in a buffer
56// @param buf Pointer to buffer
57// @param buf_len Length of buffer in bytes
58// @return via gpu_res Number of zeros found
59// @note Supports splitting the work across threads
60__global__ void count_zero(char* buf, uint64_t buf_len) {
61 gpu_res = 0;
62 uint64_t to = buf_len;
63 uint64_t i = 0;
64 if (blockDim.x > 1) {
65 // Subdivide the work
66 uint64_t chunk_sz = buf_len/blockDim.x;
67 i = threadIdx.x * chunk_sz;
68 to = threadIdx.x * chunk_sz + chunk_sz;
69 // If buffer size doesn't evenly divide, make last thread get remaineder
70 if (threadIdx.x + 1 == blockDim.x) {
71 to = buf_len;
72 }
73 }
74 uint64_t num_zero;
75 for (; i < to; i++)
76 num_zero += (!buf[i]);
77 // Cast shouldn't strictly be needed, but won't build without...
78 atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
79}
80
81long time_diff_ns(struct timespec start, struct timespec stop) {
82 return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
83}
84
85#define AS_FD 6
86// Use 8 if running with perf
87//#define AS_FD 8
88
89int main(int argc, char **argv) {
90 char* gpu_buf;
91 struct timespec out_start, out_stop, in_start, in_stop;
92 struct timespec out_start2, out_stop2, in_start2, in_stop2;
93 int res;
94 cudaStream_t stream1;
95 cudaError_t err;
96 SAFE(cudaStreamCreate(&stream1));
97 SAFE(cudaMalloc(&gpu_buf, GiB));
98
99 // Fill buffer with data
100 fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
101 SAFE(cudaStreamSynchronize(stream1));
102
103 // Reset sector assignments (does not fail)
104 ioctl(AS_FD, NVGPU_AS_IOCTL_SWAP_RESET);
105
106 // Copy out
107 struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
108 //struct nvgpu_as_swap_buffer_args ioctl_arg = {NVGPU_SWAP_ALL};
109 clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
110 res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC, &ioctl_arg);
111 clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
112 if (res < 0) {
113 perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC");
114 return res;
115 }
116 //printf("Num failed: %d\n", res);
117
118 sleep(1);
119
120 clock_gettime(CLOCK_MONOTONIC_RAW, &out_start2);
121 res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg);
122 clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop2);
123 if (res < 0) {
124 perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH");
125 return res;
126 }
127 //printf("Num failed: %d\n", res);
128
129 sleep(1); // Supposedly some other work would happen here
130
131 // Copy in
132 clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
133 res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC, &ioctl_arg);
134 clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
135 if (res < 0) {
136 perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC");
137 return res;
138 }
139 //printf("Num failed: %d\n", res);
140
141 sleep(1);
142
143 clock_gettime(CLOCK_MONOTONIC_RAW, &in_start2);
144 res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg);
145 clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop2);
146 if (res < 0) {
147 perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH");
148 return res;
149 }
150 //printf("Num failed: %d\n", res);
151
152 // Check for valid contents
153 count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
154 SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost));
155 SAFE(cudaStreamSynchronize(stream1));
156 if (res > 0) {
157 fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res);
158 return 1;
159 }
160
161 // Print results as tab-seperated-values
162 printf("out_start(us)\tout_fin(us)\tin_start(us)\tout_fin(us)\n");
163 printf("%ld\t%ld\t%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
164 ns2us(time_diff_ns(out_start2, out_stop2)),
165 ns2us(time_diff_ns(in_start, in_stop)),
166 ns2us(time_diff_ns(in_start2, in_stop2)));
167 cudaFree(gpu_buf);
168}
diff --git a/gpu_paging_speed.cu b/gpu_paging_speed.cu
new file mode 100644
index 0000000..72cb82e
--- /dev/null
+++ b/gpu_paging_speed.cu
@@ -0,0 +1,136 @@
1#include <stdio.h>
2#include <cuda.h>
3#include <curand_kernel.h> // curandState_t and curand
4#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
5#include <errno.h>
6#include <time.h> // clock_gettime
7#include <sys/ioctl.h> // ioctl
8#include <unistd.h> // sleep
9
10#define s2ns(s) ((s)*1000l*1000l*1000l)
11#define ns2us(ns) ((ns)/1000l)
12#define GiB 1024l*1024l*1024l
13
14// Originally from copy_testbed.h in the copy_experiments set
15#define SAFE(x) \
16 if ((err = (cudaError_t)(x)) != 0) { \
17 printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
18 printf("Suspect line: %s\n", #x); \
19 exit(1); \
20 }
21
22// Fill buffer with random bytes. Supports buffers >4GiB.
23// Original function on CPU part of copy_only.cu
24// @param buf Pointer to buffer
25// @param buf_len Length of buffer in bytes
26// @note Supports splitting the work across threads
27__device__ curandState_t rng_state;
28__global__ void fill_rand(char* buf, uint64_t buf_len) {
29 uint64_t to = buf_len;
30 uint64_t i = 0;
31 if (blockDim.x > 1) {
32 // Subdivide the work
33 uint64_t chunk_sz = buf_len/blockDim.x;
34 i = threadIdx.x * chunk_sz;
35 to = threadIdx.x * chunk_sz + chunk_sz;
36 // If buffer size doesn't evenly divide, make last thread get remaineder
37 if (threadIdx.x + 1 == blockDim.x) {
38 to = buf_len;
39 }
40 }
41 for (; i < to; i++)
42 buf[i] = max((curand(&rng_state) & 0xff), 1);
43}
44
45// Fill buffer with sequential quadwords
46// @param buf Pointer to buffer
47// @param buf_len Length of buffer in bytes
48__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
49 uint64_t i;
50 for (i = 0; i < buf_len; i++)
51 buf[i] = start_num++;
52}
53
54__device__ uint64_t gpu_res;
55// Count number of zeros in a buffer
56// @param buf Pointer to buffer
57// @param buf_len Length of buffer in bytes
58// @return via gpu_res Number of zeros found
59// @note Supports splitting the work across threads
60__global__ void count_zero(char* buf, uint64_t buf_len) {
61 gpu_res = 0;
62 uint64_t to = buf_len;
63 uint64_t i = 0;
64 if (blockDim.x > 1) {
65 // Subdivide the work
66 uint64_t chunk_sz = buf_len/blockDim.x;
67 i = threadIdx.x * chunk_sz;
68 to = threadIdx.x * chunk_sz + chunk_sz;
69 // If buffer size doesn't evenly divide, make last thread get remaineder
70 if (threadIdx.x + 1 == blockDim.x) {
71 to = buf_len;
72 }
73 }
74 uint64_t num_zero;
75 for (; i < to; i++)
76 num_zero += (!buf[i]);
77 // Cast shouldn't strictly be needed, but won't build without...
78 atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
79}
80
81long time_diff_ns(struct timespec start, struct timespec stop) {
82 return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
83}
84
85int main(int argc, char **argv) {
86 char* gpu_buf;
87 struct timespec out_start, out_stop, in_start, in_stop;
88 int res;
89 cudaStream_t stream1;
90 cudaError_t err;
91 SAFE(cudaStreamCreate(&stream1));
92 SAFE(cudaMalloc(&gpu_buf, GiB));
93
94 // Fill buffer with data
95 fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
96 SAFE(cudaStreamSynchronize(stream1));
97
98 // Reset sector assignments (does not fail)
99 ioctl(6, NVGPU_AS_IOCTL_SWAP_RESET);
100
101 // Copy out
102 struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
103 clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
104 res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
105 clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
106 if (res) {
107 perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
108 return res;
109 }
110
111 sleep(1); // Supposedly some other work would happen here
112
113 // Copy in
114 clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
115 res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
116 clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
117 if (res) {
118 perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
119 return res;
120 }
121
122 // Check for valid contents
123 count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
124 SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost));
125 SAFE(cudaStreamSynchronize(stream1));
126 if (res > 0) {
127 fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res);
128 return 1;
129 }
130
131 // Print results as tab-seperated-values
132 printf("out (us)\tin (us)\n");
133 printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
134 ns2us(time_diff_ns(in_start, in_stop)));
135 cudaFree(gpu_buf);
136}
diff --git a/paging_speed.c b/paging_speed.c
new file mode 100644
index 0000000..4ad56e2
--- /dev/null
+++ b/paging_speed.c
@@ -0,0 +1,137 @@
1#define _GNU_SOURCE
2
3#include <sys/mman.h>
4#include <sys/types.h>
5#include <sys/stat.h>
6#include <fcntl.h>
7#include <stdio.h>
8#include <stdint.h>
9#include <time.h>
10#include <unistd.h>
11#include <stdlib.h>
12#include <string.h> // strlen()
13
14#define GiB 1024l*1024l*1024l
15#define s2ns(s) ((s)*1000l*1000l*1000l)
16
17
18int seq_walk(char* mem, int len, char to_find) {
19 int num_42 = 0;
20 // Stride of 4096 bytes (one 4k page)
21 for (int i = 4096; i < len; i += 4096)
22 if (mem[i] == to_find)
23 num_42++;
24 return num_42;
25}
26
27long time_diff_ns(struct timespec start, struct timespec stop) {
28 return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
29}
30// TODO: take *num_42, return time
31
32//#define PAGED_FILE "/home/jbakita/1gib_random_f"
33#define PAGED_FILE "/dev/nvme0n1"
34
35int main(int argc, char **argv) {
36 int iters = 1;
37 int no_seq = 0;
38 if (argc > 1)
39 iters = atoi(argv[1]);
40 if (argc > 2) {
41 no_seq = strncmp(argv[2], "--no-seq", strlen(argv[2])) ? 1 : 0;
42 fprintf(stderr, "Skipping seq, but using no-seq emulation with demand paging\n");
43 }
44 struct timespec start, stop, seq_stop;
45 int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
46 if (clear_fd == -1) {
47 perror("Unable to open /proc/sys/vm/drop_caches");
48 return 1;
49 }
50
51 char clear_cmd = '3';
52 for (int i = 0; i < iters; i++) {
53 int fd = open(PAGED_FILE, O_RDWR);
54 if (fd == -1) {
55 perror("Unable to open " PAGED_FILE);
56 return 1;
57 }
58 // Clear page cache
59 write(clear_fd, &clear_cmd, 1);
60 // VIA MMAP
61 clock_gettime(CLOCK_MONOTONIC_RAW, &start);
62 char* mem = mmap(NULL, GiB, PROT_READ, MAP_PRIVATE, fd, 0);
63 if (mem == MAP_FAILED) {
64 perror("Unable to mmap " PAGED_FILE);
65 return 1;
66 }
67 // Fault on all the pages via a sequential walk
68 int num_42 = seq_walk(mem, GiB, 42);
69 clock_gettime(CLOCK_MONOTONIC_RAW, &stop);
70 int num_52 = 0;
71 if (no_seq)
72 num_52 = seq_walk(mem, GiB, 52);
73 clock_gettime(CLOCK_MONOTONIC_RAW, &seq_stop);
74 if (num_52)
75 fprintf(stderr, "Something is seriously wrong! Found a 52 in a buffer that should be only 42s\n");
76 long duration = (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
77 // Emulate the time demand paging would take if we didn't have to walk
78 if (no_seq) {
79 long seq_time = time_diff_ns(stop, seq_stop);
80 duration -= seq_time;
81 }
82 if (iters == 1) {
83 printf("Took %ldus via mmap\n", duration / 1000);
84 printf("Read %d 42s of %ld expected\n", num_42, GiB/4096);
85 } else {
86 printf("%ld, ", duration / 1000);
87 }
88 munmap(mem, GiB);
89 close(fd);
90 }
91 if (iters > 1)
92 printf("\n");
93
94 for (int i = 0; i < iters; i++) {
95 char* mem;
96 int fd = open(PAGED_FILE, O_RDWR | O_DIRECT);
97 if (fd == -1) {
98 perror("Unable to open " PAGED_FILE);
99 return 1;
100 }
101 // Clear page cache
102 write(clear_fd, &clear_cmd, 1);
103 // VIA READ
104 clock_gettime(CLOCK_MONOTONIC_RAW, &start);
105 // Aligned malloc(GiB) basicially
106 int res = posix_memalign((void**)&mem, 4096, GiB);
107 if (res) {
108 fprintf(stderr, "posix_memalign() failure. Error %d.", res);
109 return 1;
110 }
111 res = read(fd, mem, GiB);
112 if (res == -1) {
113 perror("Unable to read 1GiB from /dev/nvme0n1");
114 return 1;
115 }
116 if (res < GiB) {
117 fprintf(stderr, "Unable to read the buffer all at once!");
118 return 2;
119 }
120 int num_42 = 0;
121 if (!no_seq)
122 num_42 = seq_walk(mem, GiB, 42); // Not strictly necessary, but to match mmap path overheads
123 clock_gettime(CLOCK_MONOTONIC_RAW, &stop);
124 if (iters == 1) {
125 printf("Took %ldus via read\n", ((s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec)) / 1000);
126 if (!no_seq)
127 printf("Read %d 42s of %ld expected\n", num_42, GiB/4096);
128 } else {
129 printf("%ld, ", ((s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec)) / 1000);
130 }
131 close(fd);
132 free(mem);
133 }
134 if (iters > 1)
135 printf("\n");
136 return 0;
137}