// constant_cycles_kernel.cu

/* Copyright 2021-2023 Joshua Bakita
 * Simple kernel that spins on the GPU for a specified number of iterations,
 * while tracking and printing the necessary CPU time.
 */
#include <time.h>
#include <stdio.h>
#include <cuda.h>

#include "testbench.h"

/* Spin on the GPU for `iters` loop iterations.
 *
 * Launched by main() as <<<1,1>>>; every thread that runs this kernel
 * increments the same `*__unused` location without atomics, so the final
 * value is only meaningful for a single-thread launch.
 *
 * iters:    number of loop iterations to perform. (unsigned long)-1 is used
 *           by the caller to request an effectively unbounded run.
 * __unused: device pointer to an int that is incremented once per iteration,
 *           giving the loop an observable side effect the host can read back.
 */
__global__ void loop_on_gpu(unsigned long iters, int *__unused) {
	// The counter must be as wide as `iters`: an `int` counter overflows
	// (undefined behaviour) once `iters` exceeds INT_MAX, which in practice
	// ends the loop early. `volatile` is kept so that the counter accesses
	// themselves cannot be elided by the compiler.
	for (volatile unsigned long i = 0; i < iters; i++) (*__unused)++;
}

/* Run loop_on_gpu on a single GPU thread for the requested number of
 * iterations and print how long the launch took, as measured on the host with
 * CLOCK_MONOTONIC_RAW.
 *
 * Usage: <prog> <# of millions of iterations, or -1 for infinite>
 *
 * Returns 1 after printing the usage message if the arguments are missing,
 * request help, or are not a number; returns 0 otherwise. CUDA calls are
 * wrapped in SAFE() from testbench.h (presumably aborts on error -- confirm
 * against that header).
 */
int main(int argc, char **argv) {
	int res, *__unused;
	struct timespec start, end;
	char *parse_end;

	if (argc != 2 || !strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) {
		fprintf(stderr, "Usage: %s <# of millions of iterations, or -1 for infinite>\n",
		        argv[0]);
		return 1;
	}

	// Input is multiplied by one million, unless infinite.
	// strtoul() maps "-1" to ULONG_MAX, which serves as the "infinite" marker.
	unsigned long num_iters = strtoul(argv[1], &parse_end, 10);
	// Reject empty or non-numeric input rather than silently running zero
	// iterations.
	if (parse_end == argv[1] || *parse_end != '\0') {
		fprintf(stderr, "Usage: %s <# of millions of iterations, or -1 for infinite>\n",
		        argv[0]);
		return 1;
	}
	if (num_iters != (unsigned long)(-1))
		num_iters *= 1000 * 1000;

	// Initialize CUDA and a context (hack)
	SAFE(cudaMalloc(&__unused, sizeof(*__unused)));
	// cudaMalloc() does not clear memory; zero the counter so that the
	// zero-iterations check below is meaningful.
	SAFE(cudaMemset(__unused, 0, sizeof(*__unused)));
	// Make sure setup has fully completed before the clock starts.
	SAFE(cudaDeviceSynchronize());

	// Run iterations on a single thread
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
	loop_on_gpu<<<1,1>>>(num_iters, __unused);
	SAFE(cudaGetLastError() /* Check successful launch */);
	SAFE(cudaDeviceSynchronize());
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);

	// Print detailed timing information
	long elapsed = timediff(start, end);
	fprintf(stderr, "Started at %ld ns, ended at %ld ns\n",
	        s2ns(start.tv_sec) + start.tv_nsec, s2ns(end.tv_sec) + end.tv_nsec);
	fprintf(stderr, "%ld ns (%.2f ms) elapsed\n", elapsed, elapsed / (1000 * 1000.));

	// Verify success (fool optimizer). Copy exactly sizeof(res) bytes: copying
	// 8 bytes into a 4-byte int would overflow `res` on the host stack.
	SAFE(cudaMemcpy(&res, __unused, sizeof(res), cudaMemcpyDeviceToHost));
	SAFE(cudaFree(__unused));
	// Theoretically this can happen if `__unused` wraps around; maybe for a
	// very small `long` type, or a very long run. More likely indicates an error.
	if (!res)
		fprintf(stderr, "CRITICAL: Zero iterations seem completed. Likely incorrect "
		        "arguments, internal error, or corruption of CUDA internal state.\n");

	return 0;
}