path: root/gpu_paging_speed.cu
/**
 * Copyright 2022 Joshua Bakita
 */
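// Measures how long the nvgpu swap ioctls take to page a 1 GiB GPU buffer out
// of GPU memory and back in, verifying the buffer contents afterwards.
//
// Build sketch (illustrative; the -arch value is an assumption, adjust for
// your device):
//   nvcc -arch=sm_72 -o gpu_paging_speed gpu_paging_speed.cu
// Requires a kernel carrying the nvgpu swap-ioctl support declared in the
// nvgpu.h header included below.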
#include <stdio.h>
#include <cuda.h>
#include <curand_kernel.h> // curandState_t and curand
#include "/home/jbakita/kernel/nvgpu/include/uapi/linux/nvgpu.h"
#include <errno.h>
#include <time.h> // clock_gettime
#include <sys/ioctl.h> // ioctl
#include <unistd.h> // sleep

#define s2ns(s) ((s)*1000l*1000l*1000l)
#define ns2us(ns) ((ns)/1000l)
#define GiB (1024l*1024l*1024l)

// Originally from copy_testbed.h in the copy_experiments set
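// Example use: SAFE(cudaMalloc(&ptr, bytes)); any non-zero CUDA status aborts the run.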
#define SAFE(x) \
        if ((err = (cudaError_t)(x)) != 0) { \
                printf("CUDA error %d! %s\n", err, cudaGetErrorString(err)); \
                printf("Suspect line: %s\n", #x); \
                exit(1); \
        }

// Fill buffer with random bytes. Supports buffers >4GiB.
// Originally a CPU-side function in copy_only.cu
// @param buf     Pointer to buffer
// @param buf_len Length of buffer in bytes
// @note Supports splitting the work across threads
__device__ curandState_t rng_state; // Shared and never curand_init()'d; fine here since we only need arbitrary non-zero bytes
__global__ void fill_rand(char* buf, uint64_t buf_len) {
	uint64_t to = buf_len;
	uint64_t i = 0;
	if (blockDim.x > 1) {
		// Subdivide the work
		uint64_t chunk_sz = buf_len/blockDim.x;
		i = threadIdx.x * chunk_sz;
		to = threadIdx.x * chunk_sz + chunk_sz;
		// If buffer size doesn't evenly divide, make the last thread take the remainder
		if (threadIdx.x + 1 == blockDim.x) {
			to = buf_len;
		}
	}
	for (; i < to; i++)
		buf[i] = max((curand(&rng_state) & 0xff), 1); // Force non-zero so count_zero() can validate later
}

// Fill buffer with sequential 32-bit words
// @param buf     Pointer to buffer
// @param buf_len Length of buffer in 32-bit words
__global__ void fill_seq(uint32_t* buf, uint64_t buf_len, uint64_t start_num) {
	uint64_t i;
	for (i = 0; i < buf_len; i++)
		buf[i] = start_num++;
}
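// Note: fill_seq is not referenced by main() below.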

__device__ uint64_t gpu_res;
// Count number of zeros in a buffer
// @param buf     Pointer to buffer
// @param buf_len Length of buffer in bytes
// @return via gpu_res Number of zeros found
// @note Supports splitting the work across threads
__global__ void count_zero(char* buf, uint64_t buf_len) {
	// Only thread 0 clears the result; all threads must see the cleared value
	// before any of them adds to it
	if (threadIdx.x == 0)
		gpu_res = 0;
	__syncthreads();
	uint64_t to = buf_len;
	uint64_t i = 0;
	if (blockDim.x > 1) {
		// Subdivide the work
		uint64_t chunk_sz = buf_len/blockDim.x;
		i = threadIdx.x * chunk_sz;
		to = threadIdx.x * chunk_sz + chunk_sz;
		// If buffer size doesn't evenly divide, make the last thread take the remainder
		if (threadIdx.x + 1 == blockDim.x) {
			to = buf_len;
		}
	}
	uint64_t num_zero = 0;
	for (; i < to; i++)
		num_zero += (!buf[i]);
	// Cast shouldn't strictly be needed, but won't build without...
	atomicAdd_block((unsigned long long int*)&gpu_res, (unsigned long long int)num_zero);
}

// Subtract first parameter from second parameter. Return as nanoseconds.
long time_diff_ns(struct timespec start, struct timespec stop) {
	return (s2ns(stop.tv_sec) + stop.tv_nsec) - (s2ns(start.tv_sec) + start.tv_nsec);
}

int main(int argc, char **argv) {
	char* gpu_buf;
	struct timespec out_start, out_stop, in_start, in_stop;
	int res;
	cudaStream_t stream1;
	cudaError_t err;
	SAFE(cudaStreamCreate(&stream1));
	SAFE(cudaMalloc(&gpu_buf, GiB));

	// Fill buffer with data
	fill_rand<<<1,512,0,stream1>>>(gpu_buf, GiB);
	SAFE(cudaStreamSynchronize(stream1));

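	// NOTE: File descriptor 6 (used for every ioctl below) is assumed to be the
	// nvgpu address-space fd that the CUDA runtime opened for this process; the
	// number may differ on other setups (check /proc/self/fd if the ioctls fail).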
	// Reset sector assignments (does not fail)
	ioctl(6, NVGPU_AS_IOCTL_SWAP_RESET);

	// Copy out
	struct nvgpu_as_swap_buffer_args ioctl_arg = {1160};
	clock_gettime(CLOCK_MONOTONIC_RAW, &out_start);
	res = ioctl(6, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &ioctl_arg);
	clock_gettime(CLOCK_MONOTONIC_RAW, &out_stop);
	if (res) {
		perror("Error in NVMAP_AS_IOCTL_WRITE_SWAP_BUFFER");
		return res;
	}

	sleep(1); // Stand-in for other work that would run while the buffer is paged out

	// Copy in
	clock_gettime(CLOCK_MONOTONIC_RAW, &in_start);
	res = ioctl(6, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &ioctl_arg);
	clock_gettime(CLOCK_MONOTONIC_RAW, &in_stop);
	if (res) {
		perror("Error in NVMAP_AS_IOCTL_READ_SWAP_BUFFER");
		return res;
	}

	// Check for valid contents
	count_zero<<<1,512,0,stream1>>>(gpu_buf, GiB);
	SAFE(cudaStreamSynchronize(stream1));
	// gpu_res is 64 bits wide; copy it into a matching 64-bit variable
	uint64_t num_zeros;
	SAFE(cudaMemcpyFromSymbol(&num_zeros, gpu_res, sizeof(num_zeros), 0, cudaMemcpyDeviceToHost));
	if (num_zeros > 0) {
		fprintf(stderr, "Error: Found %lu zeros in supposedly non-zero buffer after paging!\n", num_zeros);
		return 1;
	}

	// Print results as tab-separated values
	printf("out (us)\tin (us)\n");
	printf("%ld\t%ld\n", ns2us(time_diff_ns(out_start, out_stop)),
			     ns2us(time_diff_ns(in_start, in_stop)));
	cudaFree(gpu_buf);
	cudaStreamDestroy(stream1);
	return 0;
}