diff options
-rw-r--r-- | Makefile | 14 | ||||
-rw-r--r-- | libsmctrl_test_global_mask.cu | 103 |
2 files changed, 113 insertions, 4 deletions
@@ -1,4 +1,5 @@ | |||
1 | CC = gcc | 1 | CC = gcc |
2 | NVCC ?= nvcc | ||
2 | # -fPIC is needed in all cases, as we may be linked into another shared library | 3 | # -fPIC is needed in all cases, as we may be linked into another shared library |
3 | CFLAGS = -fPIC | 4 | CFLAGS = -fPIC |
4 | LDFLAGS = -lcuda -I/usr/local/cuda/include | 5 | LDFLAGS = -lcuda -I/usr/local/cuda/include |
@@ -12,10 +13,15 @@ libsmctrl.a: libsmctrl.c libsmctrl.h | |||
12 | $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS) | 13 | $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS) |
13 | ar rcs $@ libsmctrl.o | 14 | ar rcs $@ libsmctrl.o |
14 | 15 | ||
15 | libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c | 16 | # Use static linking with tests to avoid LD_LIBRARY_PATH issues |
16 | $(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS) | 17 | libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a |
18 | $(CC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS) | ||
17 | 19 | ||
18 | tests: libsmctrl_test_gpc_info | 20 | libsmctrl_test_global_mask: libsmctrl_test_global_mask.cu libsmctrl.a |
21 | $(NVCC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS) | ||
22 | |||
23 | tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask | ||
19 | 24 | ||
20 | clean: | 25 | clean: |
21 | rm -f libsmctrl.so libsmctrl.a | 26 | rm -f libsmctrl.so libsmctrl.a libsmctrl_test_gpu_info \ |
27 | libsmctrl_test_global_mask | ||
diff --git a/libsmctrl_test_global_mask.cu b/libsmctrl_test_global_mask.cu new file mode 100644 index 0000000..f6a487f --- /dev/null +++ b/libsmctrl_test_global_mask.cu | |||
@@ -0,0 +1,103 @@ | |||
1 | #include <error.h> | ||
2 | #include <errno.h> | ||
3 | #include <stdio.h> | ||
4 | #include <stdbool.h> | ||
5 | #include <cuda_runtime.h> | ||
6 | |||
7 | #include "libsmctrl.h" | ||
8 | #include "testbench.h" | ||
9 | |||
10 | __global__ void read_and_store_smid(uint8_t* smid_arr) { | ||
11 | if (threadIdx.x != 1) | ||
12 | return; | ||
13 | int smid; | ||
14 | asm("mov.u32 %0, %%smid;" : "=r"(smid)); | ||
15 | smid_arr[blockIdx.x] = smid; | ||
16 | } | ||
17 | |||
18 | // Assuming SMs continue to support a maximum of 2048 resident threads, six | ||
19 | // blocks of 1024 threads should span at least three SMs without partitioning | ||
20 | #define NUM_BLOCKS 6 | ||
21 | |||
22 | int sort_asc(const void* a, const void* b) { | ||
23 | return *(uint8_t*)a - *(uint8_t*)b; | ||
24 | } | ||
25 | |||
26 | // Warning: Mutates input array via qsort | ||
27 | int count_unique(uint8_t* arr, int len) { | ||
28 | qsort(arr, len, 1, sort_asc); | ||
29 | int num_uniq = 1; | ||
30 | for (int i = 0; i < len - 1; i++) | ||
31 | num_uniq += (arr[i] != arr[i + 1]); | ||
32 | return num_uniq; | ||
33 | } | ||
34 | |||
35 | int main() { | ||
36 | cudaError_t err; // Needed by SAFE() macro | ||
37 | int res; | ||
38 | uint8_t *smids_native_d, *smids_native_h; | ||
39 | uint8_t *smids_partitioned_d, *smids_partitioned_h; | ||
40 | int uniq_native, uniq_partitioned; | ||
41 | uint32_t num_tpcs; | ||
42 | int num_sms, sms_per_tpc; | ||
43 | |||
44 | // Determine number of SMs per TPC | ||
45 | SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0)); | ||
46 | if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0)) | ||
47 | error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test"); | ||
48 | sms_per_tpc = num_sms/num_tpcs; | ||
49 | |||
50 | // Test baseline (native) behavior without partitioning | ||
51 | SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS)); | ||
52 | if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS))) | ||
53 | error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test"); | ||
54 | read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d); | ||
55 | SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost)); | ||
56 | |||
57 | uniq_native = count_unique(smids_native_h, NUM_BLOCKS); | ||
58 | if (uniq_native < sms_per_tpc) { | ||
59 | printf("libsmctrl_test_global: ***Test failure.***\n" | ||
60 | "libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 " | ||
61 | "threads were launched on the GPU, but only %d SMs were utilized, " | ||
62 | "when it was expected that at least %d would be used.\n", NUM_BLOCKS, | ||
63 | uniq_native, sms_per_tpc); | ||
64 | return 1; | ||
65 | } | ||
66 | |||
67 | // Verify that partitioning changes the SMID distribution | ||
68 | libsmctrl_set_global_mask(~0x1); // Enable only one TPC | ||
69 | SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS)); | ||
70 | if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS))) | ||
71 | error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test"); | ||
72 | read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d); | ||
73 | SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost)); | ||
74 | |||
75 | // Make sure it only ran on the number of TPCs provided | ||
76 | // May run on up to two SMs, as up to two per TPC | ||
77 | uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); | ||
78 | if (uniq_partitioned > sms_per_tpc) { | ||
79 | printf("libsmctrl_test_global: ***Test failure.***\n" | ||
80 | "libsmctrl_test_global: Reason: With global TPC mask set to " | ||
81 | "constrain all kernels to a single TPC, a kernel of %d blocks of " | ||
82 | "1024 threads was launched and found to run on %d SMs (at most %d---" | ||
83 | "one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc); | ||
84 | return 1; | ||
85 | } | ||
86 | |||
87 | // Make sure it ran on the right TPC | ||
88 | if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) { | ||
89 | printf("libsmctrl_test_global: ***Test failure.***\n" | ||
90 | "libsmctrl_test_global: Reason: With global TPC mask set to" | ||
91 | "constrain all kernels to the first TPC, a kernel was run and found " | ||
92 | "to run on an SM ID as high as %d (max of %d expected).\n", | ||
93 | smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1); | ||
94 | return 1; | ||
95 | } | ||
96 | |||
97 | printf("libsmctrl_test_global: Test passed!\n" | ||
98 | "libsmctrl_test_global: Reason: With a global partition enabled which " | ||
99 | "contained only TPC ID 0, the test kernel was found to use only %d " | ||
100 | "SMs (%d without), and all SMs in-use had IDs below %d (were contained" | ||
101 | " in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc); | ||
102 | return 0; | ||
103 | } | ||