Add measure_launch_oh benchmark and make target

Benchmark supports: => Warms up the GPU and kernel code before sampling => Uses a flag in zero-copy memory to track when a kernel begins => Outputs summary statistics to stderr, and sample data to stdout
author: Joshua Bakita <jbakita@cs.unc.edu> 2024-02-26 20:29:25 -0500
committer: Joshua Bakita <jbakita@cs.unc.edu> 2024-02-26 20:29:25 -0500
commit: 340457bd5dae3624d4c92a02b84fb8206c0348a9 (patch)
tree: 561baa1d81e9a41b48066f547435e43613106bc4
parent: 5c65954998591bc61a4138024ba4895bed64a8a6 (diff)
3 files changed, 68 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index b0d597a..7cfda47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 **/.gdb_history
 preemption_logger
 constant_cycles_kernel
+measure_launch_oh
 copy_experiments/copy_contender
 copy_experiments/mon_cross_ctx_copies
diff --git a/Makefile b/Makefile
index 688d73d..1956d72 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: all copy_experiments clean
 NVCC ?= nvcc
-all: constant_cycles_kernel preemption_logger copy_experiments
+all: constant_cycles_kernel preemption_logger copy_experiments measure_launch_oh
 constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
        $(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel
@@ -9,9 +9,12 @@ constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
 preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h
        $(NVCC) preemption_logger.cu -o preemption_logger -g
+measure_launch_oh: measure_launch_oh.cu testbench.h
+        $(NVCC) measure_launch_oh.cu -o measure_launch_oh
 copy_experiments:
        $(MAKE) -C $@
 clean:
-        rm -f constant_cycles_kernel preemption_logger
+        rm -f constant_cycles_kernel preemption_logger measure_launch_oh
        $(MAKE) -C copy_experiments clean
diff --git a/measure_launch_oh.cu b/measure_launch_oh.cu
new file mode 100644
index 0000000..981448a
--- /dev/null
+++ b/measure_launch_oh.cu
@@ -0,0 +1,62 @@
+/* Copyright 2024 Joshua Bakita
+ * Simple kernel that clocks how long a kernel launch takes to the GPU.
+ * Prints samples to stdout and summary statistics to stderr.
+ */
+#include <time.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include "testbench.h"
+__global__ void flag_on_gpu(volatile int *flag) {
+        *flag = 1;
+}
+// Get launch overhead
+long measure_launch_overhead() {
+        volatile int *barrier;
+        struct timespec start, end;
+        SAFE(cudaHostAlloc(&barrier, sizeof(*barrier), cudaHostAllocMapped));
+        *barrier = 0;
+        clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+        flag_on_gpu<<<100,100>>>(barrier);
+        while (!*barrier) continue;
+        clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+        SAFE(cudaDeviceSynchronize());
+        return timediff(start, end);
+}
+int main(int argc, char **argv) {
+        int *__unused, i;
+        unsigned long num_iters;
+        long time;
+        long double cumulative_time = 0;
+        if (argc != 2 || !strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) {
+                fprintf(stderr, "Usage: %s [# of samples]\n", argv[0]);
+                return 1;
+        }
+        num_iters = strtoul(argv[1], NULL, 10);
+        // Initialize CUDA and a context (hack)
+        SAFE(cudaMalloc(&__unused, 8));
+        // Run once to ensure the kernel is compiled
+        time = measure_launch_overhead();
+        fprintf(stderr, "(%d) %ld ns (%.2f ms) warmup launch overhead\n", getpid(),
+                time, ns2ms((double)time));
+        // Time several kernel launches
+        for (i = 0; i < num_iters; i++) {
+                time = measure_launch_overhead();
+                cumulative_time += time;
+                // Print one sample per line
+                fprintf(stdout, "%ld\n", time);
+        }
+        fprintf(stderr, "(%d) %.0Lf ns (%.2Lf ms) average launch overhead\n", getpid(),
+                cumulative_time / num_iters, ns2ms(cumulative_time / num_iters));
+        return 0;
+}
author	Joshua Bakita <jbakita@cs.unc.edu>	2024-02-26 20:29:25 -0500
committer	Joshua Bakita <jbakita@cs.unc.edu>	2024-02-26 20:29:25 -0500
commit	340457bd5dae3624d4c92a02b84fb8206c0348a9 (patch)
tree	561baa1d81e9a41b48066f547435e43613106bc4
parent	5c65954998591bc61a4138024ba4895bed64a8a6 (diff)

diff --git a/.gitignore b/.gitignore index b0d597a..7cfda47 100644 --- a/.gitignore +++ b/.gitignore
@@ -1,5 +1,6 @@
1	**/.gdb_history	1	**/.gdb_history
2	preemption_logger	2	preemption_logger
3	constant_cycles_kernel	3	constant_cycles_kernel
		4	measure_launch_oh
4	copy_experiments/copy_contender	5	copy_experiments/copy_contender
5	copy_experiments/mon_cross_ctx_copies	6	copy_experiments/mon_cross_ctx_copies


diff --git a/Makefile b/Makefile index 688d73d..1956d72 100644 --- a/Makefile +++ b/Makefile
@@ -1,7 +1,7 @@
1	.PHONY: all copy_experiments clean	1	.PHONY: all copy_experiments clean
2	NVCC ?= nvcc	2	NVCC ?= nvcc
3		3
4	all: constant_cycles_kernel preemption_logger copy_experiments	4	all: constant_cycles_kernel preemption_logger copy_experiments measure_launch_oh
5		5
6	constant_cycles_kernel: constant_cycles_kernel.cu testbench.h	6	constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
7	$(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel	7	$(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel
@@ -9,9 +9,12 @@ constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
9	preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h	9	preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h
10	$(NVCC) preemption_logger.cu -o preemption_logger -g	10	$(NVCC) preemption_logger.cu -o preemption_logger -g
11		11
		12	measure_launch_oh: measure_launch_oh.cu testbench.h
		13	$(NVCC) measure_launch_oh.cu -o measure_launch_oh
		14
12	copy_experiments:	15	copy_experiments:
13	$(MAKE) -C $@	16	$(MAKE) -C $@
14		17
15	clean:	18	clean:
16	rm -f constant_cycles_kernel preemption_logger	19	rm -f constant_cycles_kernel preemption_logger measure_launch_oh
17	$(MAKE) -C copy_experiments clean	20	$(MAKE) -C copy_experiments clean


diff --git a/measure_launch_oh.cu b/measure_launch_oh.cu new file mode 100644 index 0000000..981448a --- /dev/null +++ b/measure_launch_oh.cu
@@ -0,0 +1,62 @@
		1	/* Copyright 2024 Joshua Bakita
		2	* Simple kernel that clocks how long a kernel launch takes to the GPU.
		3	* Prints samples to stdout and summary statistics to stderr.
		4	*/
		5	#include <time.h>
		6	#include <stdio.h>
		7	#include <stdint.h>
		8	#include <unistd.h>
		9
		10	#include "testbench.h"
		11
		12	__global__ void flag_on_gpu(volatile int *flag) {
		13	*flag = 1;
		14	}
		15
		16	// Get launch overhead
		17	long measure_launch_overhead() {
		18	volatile int *barrier;
		19	struct timespec start, end;
		20	SAFE(cudaHostAlloc(&barrier, sizeof(*barrier), cudaHostAllocMapped));
		21	*barrier = 0;
		22	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
		23	flag_on_gpu<<<100,100>>>(barrier);
		24	while (!*barrier) continue;
		25	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
		26	SAFE(cudaDeviceSynchronize());
		27	return timediff(start, end);
		28	}
		29
		30	int main(int argc, char **argv) {
		31	int *__unused, i;
		32	unsigned long num_iters;
		33	long time;
		34	long double cumulative_time = 0;
		35
		36	if (argc != 2 \|\| !strcmp(argv[1], "--help") \|\| !strcmp(argv[1], "-h")) {
		37	fprintf(stderr, "Usage: %s [# of samples]\n", argv[0]);
		38	return 1;
		39	}
		40
		41	num_iters = strtoul(argv[1], NULL, 10);
		42
		43	// Initialize CUDA and a context (hack)
		44	SAFE(cudaMalloc(&__unused, 8));
		45
		46	// Run once to ensure the kernel is compiled
		47	time = measure_launch_overhead();
		48	fprintf(stderr, "(%d) %ld ns (%.2f ms) warmup launch overhead\n", getpid(),
		49	time, ns2ms((double)time));
		50
		51	// Time several kernel launches
		52	for (i = 0; i < num_iters; i++) {
		53	time = measure_launch_overhead();
		54	cumulative_time += time;
		55	// Print one sample per line
		56	fprintf(stdout, "%ld\n", time);
		57	}
		58
		59	fprintf(stderr, "(%d) %.0Lf ns (%.2Lf ms) average launch overhead\n", getpid(),
		60	cumulative_time / num_iters, ns2ms(cumulative_time / num_iters));
		61	return 0;
		62	}