diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2024-02-26 20:29:25 -0500 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2024-02-26 20:29:25 -0500 |
commit | 340457bd5dae3624d4c92a02b84fb8206c0348a9 (patch) | |
tree | 561baa1d81e9a41b48066f547435e43613106bc4 | |
parent | 5c65954998591bc61a4138024ba4895bed64a8a6 (diff) |
Add measure_launch_oh benchmark and make target
Benchmark supports:
=> Warms up the GPU and kernel code before sampling
=> Uses a flag in zero-copy memory to track when a kernel begins
=> Outputs summary statistics to stderr, and sample data to stdout
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 7 | ||||
-rw-r--r-- | measure_launch_oh.cu | 62 |
3 files changed, 68 insertions, 2 deletions
@@ -1,5 +1,6 @@ | |||
1 | **/.gdb_history | 1 | **/.gdb_history |
2 | preemption_logger | 2 | preemption_logger |
3 | constant_cycles_kernel | 3 | constant_cycles_kernel |
4 | measure_launch_oh | ||
4 | copy_experiments/copy_contender | 5 | copy_experiments/copy_contender |
5 | copy_experiments/mon_cross_ctx_copies | 6 | copy_experiments/mon_cross_ctx_copies |
@@ -1,7 +1,7 @@ | |||
1 | .PHONY: all copy_experiments clean | 1 | .PHONY: all copy_experiments clean |
2 | NVCC ?= nvcc | 2 | NVCC ?= nvcc |
3 | 3 | ||
4 | all: constant_cycles_kernel preemption_logger copy_experiments | 4 | all: constant_cycles_kernel preemption_logger copy_experiments measure_launch_oh |
5 | 5 | ||
6 | constant_cycles_kernel: constant_cycles_kernel.cu testbench.h | 6 | constant_cycles_kernel: constant_cycles_kernel.cu testbench.h |
7 | $(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel | 7 | $(NVCC) constant_cycles_kernel.cu -o constant_cycles_kernel |
@@ -9,9 +9,12 @@ constant_cycles_kernel: constant_cycles_kernel.cu testbench.h | |||
9 | preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h | 9 | preemption_logger: task_host_utilities.cu preemption_logger.cu testbench.h |
10 | $(NVCC) preemption_logger.cu -o preemption_logger -g | 10 | $(NVCC) preemption_logger.cu -o preemption_logger -g |
11 | 11 | ||
12 | measure_launch_oh: measure_launch_oh.cu testbench.h | ||
13 | $(NVCC) measure_launch_oh.cu -o measure_launch_oh | ||
14 | |||
12 | copy_experiments: | 15 | copy_experiments: |
13 | $(MAKE) -C $@ | 16 | $(MAKE) -C $@ |
14 | 17 | ||
15 | clean: | 18 | clean: |
16 | rm -f constant_cycles_kernel preemption_logger | 19 | rm -f constant_cycles_kernel preemption_logger measure_launch_oh |
17 | $(MAKE) -C copy_experiments clean | 20 | $(MAKE) -C copy_experiments clean |
diff --git a/measure_launch_oh.cu b/measure_launch_oh.cu new file mode 100644 index 0000000..981448a --- /dev/null +++ b/measure_launch_oh.cu | |||
@@ -0,0 +1,62 @@ | |||
1 | /* Copyright 2024 Joshua Bakita | ||
2 | * Simple kernel that clocks how long a kernel launch takes to the GPU. | ||
3 | * Prints samples to stdout and summary statistics to stderr. | ||
4 | */ | ||
5 | #include <time.h> | ||
6 | #include <stdio.h> | ||
7 | #include <stdint.h> | ||
8 | #include <unistd.h> | ||
9 | |||
10 | #include "testbench.h" | ||
11 | |||
12 | __global__ void flag_on_gpu(volatile int *flag) { | ||
13 | *flag = 1; | ||
14 | } | ||
15 | |||
16 | // Get launch overhead | ||
17 | long measure_launch_overhead() { | ||
18 | volatile int *barrier; | ||
19 | struct timespec start, end; | ||
20 | SAFE(cudaHostAlloc(&barrier, sizeof(*barrier), cudaHostAllocMapped)); | ||
21 | *barrier = 0; | ||
22 | clock_gettime(CLOCK_MONOTONIC_RAW, &start); | ||
23 | flag_on_gpu<<<100,100>>>(barrier); | ||
24 | while (!*barrier) continue; | ||
25 | clock_gettime(CLOCK_MONOTONIC_RAW, &end); | ||
26 | SAFE(cudaDeviceSynchronize()); | ||
27 | return timediff(start, end); | ||
28 | } | ||
29 | |||
30 | int main(int argc, char **argv) { | ||
31 | int *__unused, i; | ||
32 | unsigned long num_iters; | ||
33 | long time; | ||
34 | long double cumulative_time = 0; | ||
35 | |||
36 | if (argc != 2 || !strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) { | ||
37 | fprintf(stderr, "Usage: %s [# of samples]\n", argv[0]); | ||
38 | return 1; | ||
39 | } | ||
40 | |||
41 | num_iters = strtoul(argv[1], NULL, 10); | ||
42 | |||
43 | // Initialize CUDA and a context (hack) | ||
44 | SAFE(cudaMalloc(&__unused, 8)); | ||
45 | |||
46 | // Run once to ensure the kernel is compiled | ||
47 | time = measure_launch_overhead(); | ||
48 | fprintf(stderr, "(%d) %ld ns (%.2f ms) warmup launch overhead\n", getpid(), | ||
49 | time, ns2ms((double)time)); | ||
50 | |||
51 | // Time several kernel launches | ||
52 | for (i = 0; i < num_iters; i++) { | ||
53 | time = measure_launch_overhead(); | ||
54 | cumulative_time += time; | ||
55 | // Print one sample per line | ||
56 | fprintf(stdout, "%ld\n", time); | ||
57 | } | ||
58 | |||
59 | fprintf(stderr, "(%d) %.0Lf ns (%.2Lf ms) average launch overhead\n", getpid(), | ||
60 | cumulative_time / num_iters, ns2ms(cumulative_time / num_iters)); | ||
61 | return 0; | ||
62 | } | ||