summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2024-02-26 20:29:25 -0500
committerJoshua Bakita <jbakita@cs.unc.edu>2024-02-26 20:29:25 -0500
commit340457bd5dae3624d4c92a02b84fb8206c0348a9 (patch)
tree561baa1d81e9a41b48066f547435e43613106bc4
parent5c65954998591bc61a4138024ba4895bed64a8a6 (diff)
Add measure_launch_oh benchmark and make target
Benchmark supports: => Warms up the GPU and kernel code before sampling => Uses a flag in zero-copy memory to track when a kernel begins => Outputs summary statistics to stderr, and sample data to stdout
-rw-r--r--.gitignore1
-rw-r--r--Makefile7
-rw-r--r--measure_launch_oh.cu62
3 files changed, 68 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index b0d597a..7cfda47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
1**/.gdb_history 1**/.gdb_history
2preemption_logger 2preemption_logger
3constant_cycles_kernel 3constant_cycles_kernel
4measure_launch_oh
4copy_experiments/copy_contender 5copy_experiments/copy_contender
5copy_experiments/mon_cross_ctx_copies 6copy_experiments/mon_cross_ctx_copies
diff --git a/Makefile b/Makefile
index 688d73d..1956d72 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
1.PHONY: all copy_experiments clean 1.PHONY: all copy_experiments clean
2NVCC ?= nvcc 2NVCC ?= nvcc
3 3
4all: constant_cycles_kernel preemption_logger copy_experiments 4all: constant_cycles_kernel preemption_logger copy_experiments measure_launch_oh
5 5
6constant_cycles_kernel: constant_cycles_kernel.cu testbench.h 6constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
7 $(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel 7 $(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel
@@ -9,9 +9,12 @@ constant_cycles_kernel: constant_cycles_kernel.cu testbench.h
9preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h 9preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h
10 $(NVCC) preemption_logger.cu -o preemption_logger -g 10 $(NVCC) preemption_logger.cu -o preemption_logger -g
11 11
12measure_launch_oh: measure_launch_oh.cu testbench.h
13 $(NVCC) measure_launch_oh.cu -o measure_launch_oh
14
12copy_experiments: 15copy_experiments:
13 $(MAKE) -C $@ 16 $(MAKE) -C $@
14 17
15clean: 18clean:
16 rm -f constant_cycles_kernel preemption_logger 19 rm -f constant_cycles_kernel preemption_logger measure_launch_oh
17 $(MAKE) -C copy_experiments clean 20 $(MAKE) -C copy_experiments clean
diff --git a/measure_launch_oh.cu b/measure_launch_oh.cu
new file mode 100644
index 0000000..981448a
--- /dev/null
+++ b/measure_launch_oh.cu
@@ -0,0 +1,62 @@
1/* Copyright 2024 Joshua Bakita
2 * Simple kernel that clocks how long a kernel launch takes to the GPU.
3 * Prints samples to stdout and summary statistics to stderr.
4 */
5#include <time.h>
6#include <stdio.h>
7#include <stdint.h>
8#include <unistd.h>
9
10#include "testbench.h"
11
12__global__ void flag_on_gpu(volatile int *flag) {
13 *flag = 1;
14}
15
16// Get launch overhead
17long measure_launch_overhead() {
18 volatile int *barrier;
19 struct timespec start, end;
20 SAFE(cudaHostAlloc(&barrier, sizeof(*barrier), cudaHostAllocMapped));
21 *barrier = 0;
22 clock_gettime(CLOCK_MONOTONIC_RAW, &start);
23 flag_on_gpu<<<100,100>>>(barrier);
24 while (!*barrier) continue;
25 clock_gettime(CLOCK_MONOTONIC_RAW, &end);
26 SAFE(cudaDeviceSynchronize());
27 return timediff(start, end);
28}
29
30int main(int argc, char **argv) {
31 int *__unused, i;
32 unsigned long num_iters;
33 long time;
34 long double cumulative_time = 0;
35
36 if (argc != 2 || !strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) {
37 fprintf(stderr, "Usage: %s [# of samples]\n", argv[0]);
38 return 1;
39 }
40
41 num_iters = strtoul(argv[1], NULL, 10);
42
43 // Initialize CUDA and a context (hack)
44 SAFE(cudaMalloc(&__unused, 8));
45
46 // Run once to ensure the kernel is compiled
47 time = measure_launch_overhead();
48 fprintf(stderr, "(%d) %ld ns (%.2f ms) warmup launch overhead\n", getpid(),
49 time, ns2ms((double)time));
50
51 // Time several kernel launches
52 for (i = 0; i < num_iters; i++) {
53 time = measure_launch_overhead();
54 cumulative_time += time;
55 // Print one sample per line
56 fprintf(stdout, "%ld\n", time);
57 }
58
59 fprintf(stderr, "(%d) %.0Lf ns (%.2Lf ms) average launch overhead\n", getpid(),
60 cumulative_time / num_iters, ns2ms(cumulative_time / num_iters));
61 return 0;
62}