diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2022-09-12 10:47:56 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2022-09-12 10:47:56 -0400 |
commit | a6286e09f4a3c78522a12b3d55b53ef1245bf558 (patch) | |
tree | 1f20908883b3c4989d51bc66b655bfe258cba15d |
Initial commit of tools as used for submission.
-rw-r--r-- | bomb.c | 9 | ||||
-rw-r--r-- | directio_paging_speed.c | 129 | ||||
-rw-r--r-- | gpu_paging_evil_task.cu | 138 | ||||
-rw-r--r-- | gpu_paging_overhead_speed.cu | 168 | ||||
-rw-r--r-- | gpu_paging_speed.cu | 136 | ||||
-rw-r--r-- | paging_speed.c | 137 |
6 files changed, 717 insertions, 0 deletions
@@ -0,0 +1,9 @@ | |||
1 | #include <stdint.h> | ||
2 | #include <stdlib.h> | ||
3 | |||
4 | #define SZ 1024ull*1024ull*1024ull*16ull | ||
5 | int main() { | ||
6 | char* ptr = malloc(SZ); | ||
7 | for (uint64_t i = 0; i < SZ; i+=4096) | ||
8 | ptr[i] = i; | ||
9 | } | ||
diff --git a/directio_paging_speed.c b/directio_paging_speed.c new file mode 100644 index 0000000..b0a01d3 --- /dev/null +++ b/directio_paging_speed.c | |||
@@ -0,0 +1,129 @@ | |||
1 | #define _GNU_SOURCE | ||
2 | |||
3 | #include <sys/mman.h> | ||
4 | #include <sys/types.h> | ||
5 | #include <sys/stat.h> | ||
6 | #include <fcntl.h> | ||
7 | #include <stdio.h> | ||
8 | #include <stdint.h> | ||
9 | #include <time.h> | ||
10 | #include <unistd.h> | ||
11 | #include <stdlib.h> | ||
12 | |||
13 | #define GiB 1024l*1024l*1024l | ||
14 | #define s2ns(s) ((s)*1000l*1000l*1000l) | ||
15 | #define ns2us(ns) ((ns)/1000l) | ||
16 | #define PAGED_FILE "/dev/nvme0n1" | ||
17 | int max(int x, int y) {return x > y ? x : y;} | ||
18 | |||
19 | // Original function from copy_only.cu | ||
20 | void fill_rand(char* buf, uint64_t buf_len) { | ||
21 | uint64_t i = 0; | ||
22 | for (; i < buf_len; i++) | ||
23 | buf[i] = max((rand() & 0xff), 1); | ||
24 | } | ||
25 | |||
26 | uint64_t count_zero(char* buf, uint64_t buf_len) { | ||
27 | uint64_t i = 0; | ||
28 | uint64_t num_zeros = 0; | ||
29 | for (; i < buf_len; i++) | ||
30 | num_zeros += (!buf[i]); | ||
31 | return num_zeros; | ||
32 | } | ||
33 | |||
34 | long time_diff_ns(struct timespec start, struct timespec stop) { | ||
35 | return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
36 | } | ||
37 | |||
38 | int main(int argc, char **argv) { | ||
39 | struct timespec out_start, out_stop, in_start, in_stop; | ||
40 | int iters = 1; | ||
41 | int res; | ||
42 | if (argc > 1) | ||
43 | iters = atoi(argv[1]); | ||
44 | |||
45 | // Needed to allow page cache clearing between iterations | ||
46 | // Note: Shouldn't be needed with O_DIRECT, but include it just in case | ||
47 | int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY); | ||
48 | if (clear_fd == -1) { | ||
49 | perror("Unable to open /proc/sys/vm/drop_caches"); | ||
50 | return 1; | ||
51 | } | ||
52 | char clear_cmd = '3'; | ||
53 | |||
54 | printf("out (us)\tin (us)\n"); | ||
55 | for (int i = 0; i < iters; i++) { | ||
56 | char *mem_in, *mem_out; | ||
57 | int fd = open(PAGED_FILE, O_RDWR | O_DIRECT | O_SYNC); | ||
58 | if (fd == -1) { | ||
59 | perror("Unable to open " PAGED_FILE); | ||
60 | return 1; | ||
61 | } | ||
62 | // Clear page cache | ||
63 | write(clear_fd, &clear_cmd, 1); | ||
64 | // Allocate and fill a buffer with random data | ||
65 | // Aligned malloc(GiB) basicially | ||
66 | res = posix_memalign((void**)&mem_in, 4096, GiB); | ||
67 | fill_rand(mem_in, GiB); | ||
68 | |||
69 | // Write and free buffer | ||
70 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_start); | ||
71 | res = write(fd, mem_in, GiB); | ||
72 | free(mem_in); | ||
73 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop); | ||
74 | if (res == -1) { | ||
75 | perror("Unable to write 1GiB to " PAGED_FILE); | ||
76 | return 1; | ||
77 | } | ||
78 | if (res != GiB) { | ||
79 | fprintf(stderr, "Unable to write the buffer all at once!"); | ||
80 | return 2; | ||
81 | } | ||
82 | |||
83 | sleep(1); // Supposedly some other work would happen here | ||
84 | write(clear_fd, &clear_cmd, 1); // Just in case O_DIRECT misbehaves | ||
85 | res = lseek(fd, 0, SEEK_SET); // Reposition offset | ||
86 | if (res == -1) { | ||
87 | perror("Unable to seek to offset 0 in " PAGED_FILE); | ||
88 | return 1; | ||
89 | } | ||
90 | if (res != 0) { | ||
91 | fprintf(stderr, "Unable to seek to offset 0 in " PAGED_FILE); | ||
92 | return 2; | ||
93 | } | ||
94 | |||
95 | // Allocate and read buffer | ||
96 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_start); | ||
97 | // Aligned malloc(GiB) basicially | ||
98 | res = posix_memalign((void**)&mem_out, 4096, GiB); | ||
99 | if (res) { | ||
100 | fprintf(stderr, "posix_memalign() failure. Error %d.", res); | ||
101 | return 1; | ||
102 | } | ||
103 | res = read(fd, mem_out, GiB); | ||
104 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop); | ||
105 | if (res == -1) { | ||
106 | perror("Unable to read 1GiB from " PAGED_FILE); | ||
107 | return 1; | ||
108 | } | ||
109 | if (res < GiB) { | ||
110 | fprintf(stderr, "Unable to read the buffer all at once!"); | ||
111 | return 2; | ||
112 | } | ||
113 | |||
114 | // Check for valid contents | ||
115 | // TODO: Use CRC32 or something else a bit less dumb | ||
116 | res = count_zero(mem_out, GiB); | ||
117 | if (res > 0) { | ||
118 | fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after I/O!\n", res); | ||
119 | return 1; | ||
120 | } | ||
121 | |||
122 | // Print results as tab-seperated-values | ||
123 | printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)), | ||
124 | ns2us(time_diff_ns(in_start, in_stop))); | ||
125 | close(fd); | ||
126 | free(mem_out); | ||
127 | } | ||
128 | return 0; | ||
129 | } | ||
diff --git a/gpu_paging_evil_task.cu b/gpu_paging_evil_task.cu new file mode 100644 index 0000000..7c1ab59 --- /dev/null +++ b/gpu_paging_evil_task.cu | |||
@@ -0,0 +1,138 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <cuda.h> | ||
3 | #include <curand_kernel.h> // curandState_t and curand | ||
4 | #include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h" | ||
5 | #include <errno.h> | ||
6 | #include <time.h> // clock_gettime | ||
7 | #include <sys/ioctl.h> // ioctl | ||
8 | #include <unistd.h> // sleep | ||
9 | |||
10 | #define s2ns(s) ((s)*1000l*1000l*1000l) | ||
11 | #define ns2us(ns) ((ns)/1000l) | ||
12 | #define GiB 1024l*1024l*1024l | ||
13 | |||
14 | // Originally from copy_testbed.h in the copy_experiments set | ||
15 | #define SAFE(x) \ | ||
16 | if ((err = (cudaError_t)(x)) != 0) { \ | ||
17 | printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \ | ||
18 | printf("Suspect line: %s\n", #x); \ | ||
19 | exit(1); \ | ||
20 | } | ||
21 | |||
22 | // Fill buffer with random bytes. Supports buffers >4GiB. | ||
23 | // Original function on CPU part of copy_only.cu | ||
24 | // @param buf Pointer to buffer | ||
25 | // @param buf_len Length of buffer in bytes | ||
26 | // @note Supports splitting the work across threads | ||
27 | __device__ curandState_t rng_state; | ||
28 | __global__ void fill_rand(char* buf, uint64_t buf_len) { | ||
29 | uint64_t to = buf_len; | ||
30 | uint64_t i = 0; | ||
31 | if (blockDim.x > 1) { | ||
32 | // Subdivide the work | ||
33 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
34 | i = threadIdx.x * chunk_sz; | ||
35 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
36 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
37 | if (threadIdx.x + 1 == blockDim.x) { | ||
38 | to = buf_len; | ||
39 | } | ||
40 | } | ||
41 | for (; i < to; i++) | ||
42 | buf[i] = max((curand(&rng_state) & 0xff), 1); | ||
43 | } | ||
44 | |||
45 | // Fill buffer with sequential quadwords | ||
46 | // @param buf Pointer to buffer | ||
47 | // @param buf_len Length of buffer in bytes | ||
48 | __global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) { | ||
49 | uint64_t i; | ||
50 | for (i = 0; i < buf_len; i++) | ||
51 | buf[i] = start_num++; | ||
52 | } | ||
53 | |||
54 | __device__ uint64_t gpu_res; | ||
55 | // Count number of zeros in a buffer | ||
56 | // @param buf Pointer to buffer | ||
57 | // @param buf_len Length of buffer in bytes | ||
58 | // @return via gpu_res Number of zeros found | ||
59 | // @note Supports splitting the work across threads | ||
60 | __global__ void count_zero(char* buf, uint64_t buf_len) { | ||
61 | gpu_res = 0; | ||
62 | uint64_t to = buf_len; | ||
63 | uint64_t i = 0; | ||
64 | if (blockDim.x > 1) { | ||
65 | // Subdivide the work | ||
66 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
67 | i = threadIdx.x * chunk_sz; | ||
68 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
69 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
70 | if (threadIdx.x + 1 == blockDim.x) { | ||
71 | to = buf_len; | ||
72 | } | ||
73 | } | ||
74 | uint64_t num_zero; | ||
75 | for (; i < to; i++) | ||
76 | num_zero += (!buf[i]); | ||
77 | // Cast shouldn't strictly be needed, but won't build without... | ||
78 | atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero); | ||
79 | } | ||
80 | |||
81 | long time_diff_ns(struct timespec start, struct timespec stop) { | ||
82 | return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
83 | } | ||
84 | |||
85 | int main(int argc, char **argv) { | ||
86 | char* gpu_buf; | ||
87 | struct timespec out_start, out_stop; | ||
88 | int res; | ||
89 | cudaStream_t stream1; | ||
90 | cudaError_t err; | ||
91 | int iters; | ||
92 | if (argc != 2) { | ||
93 | fprintf(stderr, "Usage: %s <iterations>\n", argv[0]); | ||
94 | return 1; | ||
95 | } | ||
96 | iters = atoi(argv[1]); | ||
97 | SAFE(cudaStreamCreate(&stream1)); | ||
98 | SAFE(cudaMalloc(&gpu_buf, GiB)); | ||
99 | |||
100 | // Fill buffer with data | ||
101 | fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
102 | SAFE(cudaStreamSynchronize(stream1)); | ||
103 | |||
104 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_start); | ||
105 | |||
106 | for (int i = 0; i < iters; i++) { | ||
107 | // Copy out | ||
108 | struct nvgpu_as_swap_buffer_args ioctl_arg = {1160}; | ||
109 | res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg); | ||
110 | if (res) { | ||
111 | perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER"); | ||
112 | return res; | ||
113 | } | ||
114 | |||
115 | // Copy in | ||
116 | res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg); | ||
117 | if (res) { | ||
118 | perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER"); | ||
119 | return res; | ||
120 | } | ||
121 | } | ||
122 | |||
123 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop); | ||
124 | |||
125 | // Check for valid contents | ||
126 | count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
127 | SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost)); | ||
128 | SAFE(cudaStreamSynchronize(stream1)); | ||
129 | if (res > 0) { | ||
130 | fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res); | ||
131 | return 1; | ||
132 | } | ||
133 | |||
134 | long duration = ns2us(time_diff_ns(out_start, out_stop)); | ||
135 | printf("Took %ldus to do %d paging loops (%.2f us per loop)\n", duration, iters, duration/(float)iters); | ||
136 | |||
137 | cudaFree(gpu_buf); | ||
138 | } | ||
diff --git a/gpu_paging_overhead_speed.cu b/gpu_paging_overhead_speed.cu new file mode 100644 index 0000000..c7a0f3a --- /dev/null +++ b/gpu_paging_overhead_speed.cu | |||
@@ -0,0 +1,168 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <cuda.h> | ||
3 | #include <curand_kernel.h> // curandState_t and curand | ||
4 | #include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h" | ||
5 | #include <errno.h> | ||
6 | #include <time.h> // clock_gettime | ||
7 | #include <sys/ioctl.h> // ioctl | ||
8 | #include <unistd.h> // sleep | ||
9 | |||
10 | #define s2ns(s) ((s)*1000l*1000l*1000l) | ||
11 | #define ns2us(ns) ((ns)/1000l) | ||
12 | #define GiB 1024l*1024l*1024l | ||
13 | |||
14 | // Originally from copy_testbed.h in the copy_experiments set | ||
15 | #define SAFE(x) \ | ||
16 | if ((err = (cudaError_t)(x)) != 0) { \ | ||
17 | printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \ | ||
18 | printf("Suspect line: %s\n", #x); \ | ||
19 | exit(1); \ | ||
20 | } | ||
21 | |||
22 | // Fill buffer with random bytes. Supports buffers >4GiB. | ||
23 | // Original function on CPU part of copy_only.cu | ||
24 | // @param buf Pointer to buffer | ||
25 | // @param buf_len Length of buffer in bytes | ||
26 | // @note Supports splitting the work across threads | ||
27 | __device__ curandState_t rng_state; | ||
28 | __global__ void fill_rand(char* buf, uint64_t buf_len) { | ||
29 | uint64_t to = buf_len; | ||
30 | uint64_t i = 0; | ||
31 | if (blockDim.x > 1) { | ||
32 | // Subdivide the work | ||
33 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
34 | i = threadIdx.x * chunk_sz; | ||
35 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
36 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
37 | if (threadIdx.x + 1 == blockDim.x) { | ||
38 | to = buf_len; | ||
39 | } | ||
40 | } | ||
41 | for (; i < to; i++) | ||
42 | buf[i] = max((curand(&rng_state) & 0xff), 1); | ||
43 | } | ||
44 | |||
45 | // Fill buffer with sequential quadwords | ||
46 | // @param buf Pointer to buffer | ||
47 | // @param buf_len Length of buffer in bytes | ||
48 | __global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) { | ||
49 | uint64_t i; | ||
50 | for (i = 0; i < buf_len; i++) | ||
51 | buf[i] = start_num++; | ||
52 | } | ||
53 | |||
54 | __device__ uint64_t gpu_res; | ||
55 | // Count number of zeros in a buffer | ||
56 | // @param buf Pointer to buffer | ||
57 | // @param buf_len Length of buffer in bytes | ||
58 | // @return via gpu_res Number of zeros found | ||
59 | // @note Supports splitting the work across threads | ||
60 | __global__ void count_zero(char* buf, uint64_t buf_len) { | ||
61 | gpu_res = 0; | ||
62 | uint64_t to = buf_len; | ||
63 | uint64_t i = 0; | ||
64 | if (blockDim.x > 1) { | ||
65 | // Subdivide the work | ||
66 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
67 | i = threadIdx.x * chunk_sz; | ||
68 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
69 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
70 | if (threadIdx.x + 1 == blockDim.x) { | ||
71 | to = buf_len; | ||
72 | } | ||
73 | } | ||
74 | uint64_t num_zero; | ||
75 | for (; i < to; i++) | ||
76 | num_zero += (!buf[i]); | ||
77 | // Cast shouldn't strictly be needed, but won't build without... | ||
78 | atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero); | ||
79 | } | ||
80 | |||
81 | long time_diff_ns(struct timespec start, struct timespec stop) { | ||
82 | return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
83 | } | ||
84 | |||
85 | #define AS_FD 6 | ||
86 | // Use 8 if running with perf | ||
87 | //#define AS_FD 8 | ||
88 | |||
89 | int main(int argc, char **argv) { | ||
90 | char* gpu_buf; | ||
91 | struct timespec out_start, out_stop, in_start, in_stop; | ||
92 | struct timespec out_start2, out_stop2, in_start2, in_stop2; | ||
93 | int res; | ||
94 | cudaStream_t stream1; | ||
95 | cudaError_t err; | ||
96 | SAFE(cudaStreamCreate(&stream1)); | ||
97 | SAFE(cudaMalloc(&gpu_buf, GiB)); | ||
98 | |||
99 | // Fill buffer with data | ||
100 | fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
101 | SAFE(cudaStreamSynchronize(stream1)); | ||
102 | |||
103 | // Reset sector assignments (does not fail) | ||
104 | ioctl(AS_FD, NVGPU_AS_IOCTL_SWAP_RESET); | ||
105 | |||
106 | // Copy out | ||
107 | struct nvgpu_as_swap_buffer_args ioctl_arg = {1160}; | ||
108 | //struct nvgpu_as_swap_buffer_args ioctl_arg = {NVGPU_SWAP_ALL}; | ||
109 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_start); | ||
110 | res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC, &ioctl_arg); | ||
111 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop); | ||
112 | if (res < 0) { | ||
113 | perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC"); | ||
114 | return res; | ||
115 | } | ||
116 | //printf("Num failed: %d\n", res); | ||
117 | |||
118 | sleep(1); | ||
119 | |||
120 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_start2); | ||
121 | res = ioctl(AS_FD, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg); | ||
122 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop2); | ||
123 | if (res < 0) { | ||
124 | perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER_ASYNC_FINISH"); | ||
125 | return res; | ||
126 | } | ||
127 | //printf("Num failed: %d\n", res); | ||
128 | |||
129 | sleep(1); // Supposedly some other work would happen here | ||
130 | |||
131 | // Copy in | ||
132 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_start); | ||
133 | res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC, &ioctl_arg); | ||
134 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop); | ||
135 | if (res < 0) { | ||
136 | perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC"); | ||
137 | return res; | ||
138 | } | ||
139 | //printf("Num failed: %d\n", res); | ||
140 | |||
141 | sleep(1); | ||
142 | |||
143 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_start2); | ||
144 | res = ioctl(AS_FD, NVGPU_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH, &ioctl_arg); | ||
145 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop2); | ||
146 | if (res < 0) { | ||
147 | perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER_ASYNC_FINISH"); | ||
148 | return res; | ||
149 | } | ||
150 | //printf("Num failed: %d\n", res); | ||
151 | |||
152 | // Check for valid contents | ||
153 | count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
154 | SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost)); | ||
155 | SAFE(cudaStreamSynchronize(stream1)); | ||
156 | if (res > 0) { | ||
157 | fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res); | ||
158 | return 1; | ||
159 | } | ||
160 | |||
161 | // Print results as tab-seperated-values | ||
162 | printf("out_start(us)\tout_fin(us)\tin_start(us)\tout_fin(us)\n"); | ||
163 | printf("%ld\t%ld\t%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)), | ||
164 | ns2us(time_diff_ns(out_start2, out_stop2)), | ||
165 | ns2us(time_diff_ns(in_start, in_stop)), | ||
166 | ns2us(time_diff_ns(in_start2, in_stop2))); | ||
167 | cudaFree(gpu_buf); | ||
168 | } | ||
diff --git a/gpu_paging_speed.cu b/gpu_paging_speed.cu new file mode 100644 index 0000000..72cb82e --- /dev/null +++ b/gpu_paging_speed.cu | |||
@@ -0,0 +1,136 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <cuda.h> | ||
3 | #include <curand_kernel.h> // curandState_t and curand | ||
4 | #include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h" | ||
5 | #include <errno.h> | ||
6 | #include <time.h> // clock_gettime | ||
7 | #include <sys/ioctl.h> // ioctl | ||
8 | #include <unistd.h> // sleep | ||
9 | |||
10 | #define s2ns(s) ((s)*1000l*1000l*1000l) | ||
11 | #define ns2us(ns) ((ns)/1000l) | ||
12 | #define GiB 1024l*1024l*1024l | ||
13 | |||
14 | // Originally from copy_testbed.h in the copy_experiments set | ||
15 | #define SAFE(x) \ | ||
16 | if ((err = (cudaError_t)(x)) != 0) { \ | ||
17 | printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \ | ||
18 | printf("Suspect line: %s\n", #x); \ | ||
19 | exit(1); \ | ||
20 | } | ||
21 | |||
22 | // Fill buffer with random bytes. Supports buffers >4GiB. | ||
23 | // Original function on CPU part of copy_only.cu | ||
24 | // @param buf Pointer to buffer | ||
25 | // @param buf_len Length of buffer in bytes | ||
26 | // @note Supports splitting the work across threads | ||
27 | __device__ curandState_t rng_state; | ||
28 | __global__ void fill_rand(char* buf, uint64_t buf_len) { | ||
29 | uint64_t to = buf_len; | ||
30 | uint64_t i = 0; | ||
31 | if (blockDim.x > 1) { | ||
32 | // Subdivide the work | ||
33 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
34 | i = threadIdx.x * chunk_sz; | ||
35 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
36 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
37 | if (threadIdx.x + 1 == blockDim.x) { | ||
38 | to = buf_len; | ||
39 | } | ||
40 | } | ||
41 | for (; i < to; i++) | ||
42 | buf[i] = max((curand(&rng_state) & 0xff), 1); | ||
43 | } | ||
44 | |||
45 | // Fill buffer with sequential quadwords | ||
46 | // @param buf Pointer to buffer | ||
47 | // @param buf_len Length of buffer in bytes | ||
48 | __global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) { | ||
49 | uint64_t i; | ||
50 | for (i = 0; i < buf_len; i++) | ||
51 | buf[i] = start_num++; | ||
52 | } | ||
53 | |||
54 | __device__ uint64_t gpu_res; | ||
55 | // Count number of zeros in a buffer | ||
56 | // @param buf Pointer to buffer | ||
57 | // @param buf_len Length of buffer in bytes | ||
58 | // @return via gpu_res Number of zeros found | ||
59 | // @note Supports splitting the work across threads | ||
60 | __global__ void count_zero(char* buf, uint64_t buf_len) { | ||
61 | gpu_res = 0; | ||
62 | uint64_t to = buf_len; | ||
63 | uint64_t i = 0; | ||
64 | if (blockDim.x > 1) { | ||
65 | // Subdivide the work | ||
66 | uint64_t chunk_sz = buf_len/blockDim.x; | ||
67 | i = threadIdx.x * chunk_sz; | ||
68 | to = threadIdx.x * chunk_sz + chunk_sz; | ||
69 | // If buffer size doesn't evenly divide, make last thread get remaineder | ||
70 | if (threadIdx.x + 1 == blockDim.x) { | ||
71 | to = buf_len; | ||
72 | } | ||
73 | } | ||
74 | uint64_t num_zero; | ||
75 | for (; i < to; i++) | ||
76 | num_zero += (!buf[i]); | ||
77 | // Cast shouldn't strictly be needed, but won't build without... | ||
78 | atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero); | ||
79 | } | ||
80 | |||
81 | long time_diff_ns(struct timespec start, struct timespec stop) { | ||
82 | return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
83 | } | ||
84 | |||
85 | int main(int argc, char **argv) { | ||
86 | char* gpu_buf; | ||
87 | struct timespec out_start, out_stop, in_start, in_stop; | ||
88 | int res; | ||
89 | cudaStream_t stream1; | ||
90 | cudaError_t err; | ||
91 | SAFE(cudaStreamCreate(&stream1)); | ||
92 | SAFE(cudaMalloc(&gpu_buf, GiB)); | ||
93 | |||
94 | // Fill buffer with data | ||
95 | fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
96 | SAFE(cudaStreamSynchronize(stream1)); | ||
97 | |||
98 | // Reset sector assignments (does not fail) | ||
99 | ioctl(6, NVGPU_AS_IOCTL_SWAP_RESET); | ||
100 | |||
101 | // Copy out | ||
102 | struct nvgpu_as_swap_buffer_args ioctl_arg = {1160}; | ||
103 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_start); | ||
104 | res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg); | ||
105 | clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop); | ||
106 | if (res) { | ||
107 | perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER"); | ||
108 | return res; | ||
109 | } | ||
110 | |||
111 | sleep(1); // Supposedly some other work would happen here | ||
112 | |||
113 | // Copy in | ||
114 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_start); | ||
115 | res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg); | ||
116 | clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop); | ||
117 | if (res) { | ||
118 | perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER"); | ||
119 | return res; | ||
120 | } | ||
121 | |||
122 | // Check for valid contents | ||
123 | count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB); | ||
124 | SAFE(cudaMemcpyFromSymbol(&res, gpu_res, sizeof(unsigned long), 0, cudaMemcpyDeviceToHost)); | ||
125 | SAFE(cudaStreamSynchronize(stream1)); | ||
126 | if (res > 0) { | ||
127 | fprintf(stderr, "Error: Found %d zeros in supposedly non-zero buffer after paging!\n", res); | ||
128 | return 1; | ||
129 | } | ||
130 | |||
131 | // Print results as tab-seperated-values | ||
132 | printf("out (us)\tin (us)\n"); | ||
133 | printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)), | ||
134 | ns2us(time_diff_ns(in_start, in_stop))); | ||
135 | cudaFree(gpu_buf); | ||
136 | } | ||
diff --git a/paging_speed.c b/paging_speed.c new file mode 100644 index 0000000..4ad56e2 --- /dev/null +++ b/paging_speed.c | |||
@@ -0,0 +1,137 @@ | |||
1 | #define _GNU_SOURCE | ||
2 | |||
3 | #include <sys/mman.h> | ||
4 | #include <sys/types.h> | ||
5 | #include <sys/stat.h> | ||
6 | #include <fcntl.h> | ||
7 | #include <stdio.h> | ||
8 | #include <stdint.h> | ||
9 | #include <time.h> | ||
10 | #include <unistd.h> | ||
11 | #include <stdlib.h> | ||
12 | #include <string.h> // strlen() | ||
13 | |||
14 | #define GiB 1024l*1024l*1024l | ||
15 | #define s2ns(s) ((s)*1000l*1000l*1000l) | ||
16 | |||
17 | |||
18 | int seq_walk(char* mem, int len, char to_find) { | ||
19 | int num_42 = 0; | ||
20 | // Stride of 4096 bytes (one 4k page) | ||
21 | for (int i = 4096; i < len; i += 4096) | ||
22 | if (mem[i] == to_find) | ||
23 | num_42++; | ||
24 | return num_42; | ||
25 | } | ||
26 | |||
27 | long time_diff_ns(struct timespec start, struct timespec stop) { | ||
28 | return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
29 | } | ||
30 | // TODO: take *num_42, return time | ||
31 | |||
32 | //#define PAGED_FILE "/home/jbakita/1gib_random_f" | ||
33 | #define PAGED_FILE "/dev/nvme0n1" | ||
34 | |||
35 | int main(int argc, char **argv) { | ||
36 | int iters = 1; | ||
37 | int no_seq = 0; | ||
38 | if (argc > 1) | ||
39 | iters = atoi(argv[1]); | ||
40 | if (argc > 2) { | ||
41 | no_seq = strncmp(argv[2], "--no-seq", strlen(argv[2])) ? 1 : 0; | ||
42 | fprintf(stderr, "Skipping seq, but using no-seq emulation with demand paging\n"); | ||
43 | } | ||
44 | struct timespec start, stop, seq_stop; | ||
45 | int clear_fd = open("/proc/sys/vm/drop_caches", O_WRONLY); | ||
46 | if (clear_fd == -1) { | ||
47 | perror("Unable to open /proc/sys/vm/drop_caches"); | ||
48 | return 1; | ||
49 | } | ||
50 | |||
51 | char clear_cmd = '3'; | ||
52 | for (int i = 0; i < iters; i++) { | ||
53 | int fd = open(PAGED_FILE, O_RDWR); | ||
54 | if (fd == -1) { | ||
55 | perror("Unable to open " PAGED_FILE); | ||
56 | return 1; | ||
57 | } | ||
58 | // Clear page cache | ||
59 | write(clear_fd, &clear_cmd, 1); | ||
60 | // VIA MMAP | ||
61 | clock_gettime(CLOCK_MONOTONIC_RAW, &start); | ||
62 | char* mem = mmap(NULL, GiB, PROT_READ, MAP_PRIVATE, fd, 0); | ||
63 | if (mem == MAP_FAILED) { | ||
64 | perror("Unable to mmap " PAGED_FILE); | ||
65 | return 1; | ||
66 | } | ||
67 | // Fault on all the pages via a sequential walk | ||
68 | int num_42 = seq_walk(mem, GiB, 42); | ||
69 | clock_gettime(CLOCK_MONOTONIC_RAW, &stop); | ||
70 | int num_52 = 0; | ||
71 | if (no_seq) | ||
72 | num_52 = seq_walk(mem, GiB, 52); | ||
73 | clock_gettime(CLOCK_MONOTONIC_RAW, &seq_stop); | ||
74 | if (num_52) | ||
75 | fprintf(stderr, "Something is seriously wrong! Found a 52 in a buffer that should be only 42s\n"); | ||
76 | long duration = (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec); | ||
77 | // Emulate the time demand paging would take if we didn't have to walk | ||
78 | if (no_seq) { | ||
79 | long seq_time = time_diff_ns(stop, seq_stop); | ||
80 | duration -= seq_time; | ||
81 | } | ||
82 | if (iters == 1) { | ||
83 | printf("Took %ldus via mmap\n", duration / 1000); | ||
84 | printf("Read %d 42s of %ld expected\n", num_42, GiB/4096); | ||
85 | } else { | ||
86 | printf("%ld, ", duration / 1000); | ||
87 | } | ||
88 | munmap(mem, GiB); | ||
89 | close(fd); | ||
90 | } | ||
91 | if (iters > 1) | ||
92 | printf("\n"); | ||
93 | |||
94 | for (int i = 0; i < iters; i++) { | ||
95 | char* mem; | ||
96 | int fd = open(PAGED_FILE, O_RDWR | O_DIRECT); | ||
97 | if (fd == -1) { | ||
98 | perror("Unable to open " PAGED_FILE); | ||
99 | return 1; | ||
100 | } | ||
101 | // Clear page cache | ||
102 | write(clear_fd, &clear_cmd, 1); | ||
103 | // VIA READ | ||
104 | clock_gettime(CLOCK_MONOTONIC_RAW, &start); | ||
105 | // Aligned malloc(GiB) basicially | ||
106 | int res = posix_memalign((void**)&mem, 4096, GiB); | ||
107 | if (res) { | ||
108 | fprintf(stderr, "posix_memalign() failure. Error %d.", res); | ||
109 | return 1; | ||
110 | } | ||
111 | res = read(fd, mem, GiB); | ||
112 | if (res == -1) { | ||
113 | perror("Unable to read 1GiB from /dev/nvme0n1"); | ||
114 | return 1; | ||
115 | } | ||
116 | if (res < GiB) { | ||
117 | fprintf(stderr, "Unable to read the buffer all at once!"); | ||
118 | return 2; | ||
119 | } | ||
120 | int num_42 = 0; | ||
121 | if (!no_seq) | ||
122 | num_42 = seq_walk(mem, GiB, 42); // Not strictly necessary, but to match mmap path overheads | ||
123 | clock_gettime(CLOCK_MONOTONIC_RAW, &stop); | ||
124 | if (iters == 1) { | ||
125 | printf("Took %ldus via read\n", ((s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec)) / 1000); | ||
126 | if (!no_seq) | ||
127 | printf("Read %d 42s of %ld expected\n", num_42, GiB/4096); | ||
128 | } else { | ||
129 | printf("%ld, ", ((s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec)) / 1000); | ||
130 | } | ||
131 | close(fd); | ||
132 | free(mem); | ||
133 | } | ||
134 | if (iters > 1) | ||
135 | printf("\n"); | ||
136 | return 0; | ||
137 | } | ||