aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2023-10-17 15:32:51 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2023-10-17 15:32:51 -0400
commitaba56610404c90143f7837aadfd19b769caf5460 (patch)
tree7cfd9598701c415cee78cc5ff60b748b43c68897
parent2c4b2d784815c5a2b4c49592b912c043f3d2a954 (diff)
Add test for libsmctrl_set_global_mask()
Also use static linking for tests, to avoid a need to set LD_LIBRARY_PATH to include the libsmctrl directory.
-rw-r--r--Makefile14
-rw-r--r--libsmctrl_test_global_mask.cu103
2 files changed, 113 insertions, 4 deletions
diff --git a/Makefile b/Makefile
index aa59792..cfbd971 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
1CC = gcc 1CC = gcc
2NVCC ?= nvcc
2# -fPIC is needed in all cases, as we may be linked into another shared library 3# -fPIC is needed in all cases, as we may be linked into another shared library
3CFLAGS = -fPIC 4CFLAGS = -fPIC
4LDFLAGS = -lcuda -I/usr/local/cuda/include 5LDFLAGS = -lcuda -I/usr/local/cuda/include
@@ -12,10 +13,15 @@ libsmctrl.a: libsmctrl.c libsmctrl.h
12 $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS) 13 $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
13 ar rcs $@ libsmctrl.o 14 ar rcs $@ libsmctrl.o
14 15
15libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c 16# Use static linking with tests to avoid LD_LIBRARY_PATH issues
16 $(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS) 17libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a
18 $(CC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
17 19
18tests: libsmctrl_test_gpc_info 20libsmctrl_test_global_mask: libsmctrl_test_global_mask.cu libsmctrl.a
21 $(NVCC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
22
23tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask
19 24
20clean: 25clean:
21 rm -f libsmctrl.so libsmctrl.a 26 rm -f libsmctrl.so libsmctrl.a libsmctrl_test_gpu_info \
27 libsmctrl_test_global_mask
diff --git a/libsmctrl_test_global_mask.cu b/libsmctrl_test_global_mask.cu
new file mode 100644
index 0000000..f6a487f
--- /dev/null
+++ b/libsmctrl_test_global_mask.cu
@@ -0,0 +1,103 @@
1#include <error.h>
2#include <errno.h>
3#include <stdio.h>
4#include <stdbool.h>
5#include <cuda_runtime.h>
6
7#include "libsmctrl.h"
8#include "testbench.h"
9
10__global__ void read_and_store_smid(uint8_t* smid_arr) {
11 if (threadIdx.x != 1)
12 return;
13 int smid;
14 asm("mov.u32 %0, %%smid;" : "=r"(smid));
15 smid_arr[blockIdx.x] = smid;
16}
17
18// Assuming SMs continue to support a maximum of 2048 resident threads, six
19// blocks of 1024 threads should span at least three SMs without partitioning
20#define NUM_BLOCKS 6
21
22int sort_asc(const void* a, const void* b) {
23 return *(uint8_t*)a - *(uint8_t*)b;
24}
25
26// Warning: Mutates input array via qsort
27int count_unique(uint8_t* arr, int len) {
28 qsort(arr, len, 1, sort_asc);
29 int num_uniq = 1;
30 for (int i = 0; i < len - 1; i++)
31 num_uniq += (arr[i] != arr[i + 1]);
32 return num_uniq;
33}
34
35int main() {
36 cudaError_t err; // Needed by SAFE() macro
37 int res;
38 uint8_t *smids_native_d, *smids_native_h;
39 uint8_t *smids_partitioned_d, *smids_partitioned_h;
40 int uniq_native, uniq_partitioned;
41 uint32_t num_tpcs;
42 int num_sms, sms_per_tpc;
43
44 // Determine number of SMs per TPC
45 SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
46 if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
47 error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test");
48 sms_per_tpc = num_sms/num_tpcs;
49
50 // Test baseline (native) behavior without partitioning
51 SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
52 if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
53 error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
54 read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d);
55 SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
56
57 uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
58 if (uniq_native < sms_per_tpc) {
59 printf("libsmctrl_test_global: ***Test failure.***\n"
60 "libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 "
61 "threads were launched on the GPU, but only %d SMs were utilized, "
62 "when it was expected that at least %d would be used.\n", NUM_BLOCKS,
63 uniq_native, sms_per_tpc);
64 return 1;
65 }
66
67 // Verify that partitioning changes the SMID distribution
68 libsmctrl_set_global_mask(~0x1); // Enable only one TPC
69 SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
70 if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
71 error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
72 read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d);
73 SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
74
75 // Make sure it only ran on the number of TPCs provided
76 // May run on up to two SMs, as up to two per TPC
77 uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS);
78 if (uniq_partitioned > sms_per_tpc) {
79 printf("libsmctrl_test_global: ***Test failure.***\n"
80 "libsmctrl_test_global: Reason: With global TPC mask set to "
81 "constrain all kernels to a single TPC, a kernel of %d blocks of "
82 "1024 threads was launched and found to run on %d SMs (at most %d---"
83 "one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
84 return 1;
85 }
86
87 // Make sure it ran on the right TPC
88 if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) {
89 printf("libsmctrl_test_global: ***Test failure.***\n"
90 "libsmctrl_test_global: Reason: With global TPC mask set to"
91 "constrain all kernels to the first TPC, a kernel was run and found "
92 "to run on an SM ID as high as %d (max of %d expected).\n",
93 smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1);
94 return 1;
95 }
96
97 printf("libsmctrl_test_global: Test passed!\n"
98 "libsmctrl_test_global: Reason: With a global partition enabled which "
99 "contained only TPC ID 0, the test kernel was found to use only %d "
100 "SMs (%d without), and all SMs in-use had IDs below %d (were contained"
101 " in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc);
102 return 0;
103}