aboutsummaryrefslogtreecommitdiffstats
path: root/libsmctrl_test_mask_shared.cu
diff options
context:
space:
mode:
Diffstat (limited to 'libsmctrl_test_mask_shared.cu')
-rw-r--r--libsmctrl_test_mask_shared.cu140
1 files changed, 140 insertions, 0 deletions
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu
new file mode 100644
index 0000000..c3817fa
--- /dev/null
+++ b/libsmctrl_test_mask_shared.cu
@@ -0,0 +1,140 @@
1// Copyright 2023 Joshua Bakita
2#include <error.h>
3#include <errno.h>
4#include <stdio.h>
5#include <stdbool.h>
6#include <cuda_runtime.h>
7
8#include "libsmctrl.h"
9#include "testbench.h"
10#include "libsmctrl_test_mask_shared.h"
11
// Record, in smid_arr[blockIdx.x], the ID of the SM this block executed on.
// One representative thread per block performs the store; SM IDs are
// truncated to 8 bits (assumes GPUs with fewer than 256 SMs).
__global__ void read_and_store_smid(uint8_t* smid_arr) {
	if (threadIdx.x == 1) {
		int my_smid;
		asm("mov.u32 %0, %%smid;" : "=r"(my_smid));
		smid_arr[blockIdx.x] = my_smid;
	}
}
19
// Assuming SMs continue to support a maximum of 2048 resident threads, each
// SM can host at most two 1024-thread blocks, so NUM_BLOCKS blocks of 1024
// threads should span many SMs without partitioning
22#define NUM_BLOCKS 142 //6
23
// qsort() comparator: orders uint8_t values ascending
static int sort_asc(const void* a, const void* b) {
	uint8_t lhs = *(const uint8_t*)a;
	uint8_t rhs = *(const uint8_t*)b;
	return (int)lhs - (int)rhs;
}
27
// Count the number of distinct values in arr.
// Warning: Mutates input array via qsort (leaves it sorted ascending)
// Returns 0 for an empty array (previously incorrectly returned 1).
static int count_unique(uint8_t* arr, int len) {
	if (len <= 0)
		return 0;
	qsort(arr, len, 1, sort_asc);
	// Sorted order means each new value differs from its predecessor
	int num_uniq = 1;
	for (int i = 0; i < len - 1; i++)
		num_uniq += (arr[i] != arr[i + 1]);
	return num_uniq;
}
36
// Test that adding an SM mask:
// 1. Constrains the number of SMs accessible
// 2. Constrains an application to the correct subset of SMs
// Returns 0 on success, 1 on test failure; exits via error() on setup
// failure. (Early failure returns leave allocations for the OS to reclaim at
// process exit, as this is expected to run as a standalone test binary.)
int test_constrained_size_and_location(enum partitioning_type part_type) {
	int res;
	uint8_t *smids_native_d, *smids_native_h;
	uint8_t *smids_partitioned_d, *smids_partitioned_h;
	int uniq_native, uniq_partitioned;
	uint32_t num_tpcs;
	int num_sms, sms_per_tpc;
	cudaStream_t stream;

	SAFE(cudaStreamCreate(&stream));

	// Determine number of SMs per TPC
	SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
	if ((res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0)))
		error(1, res, "Unable to get TPC configuration for test");
	sms_per_tpc = num_sms/num_tpcs;

	// Allocate all buffers once, up-front. (Previously the partitioned
	// buffers were re-allocated on every loop iteration below and never
	// freed, leaking host and device memory.)
	SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
	SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
	if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "Unable to allocate memory for test");
	if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "Unable to allocate memory for test");

	// Test baseline (native) behavior without partitioning
	read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_native_d);
	SAFE(cudaGetLastError()); // Surface any kernel-launch-configuration error
	SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

	uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
	if (uniq_native < sms_per_tpc) {
		printf("%s: ***Test failure.***\n"
		       "%s: Reason: In baseline test, %d blocks of 1024 "
		       "threads were launched on the GPU, but only %d SMs were utilized, "
		       "when it was expected that at least %d would be used.\n", program_invocation_name, program_invocation_name, NUM_BLOCKS,
		       uniq_native, sms_per_tpc);
		return 1;
	}

	// Test at 32-TPC boundaries to verify that the mask is applied in the
	// correct order to each of the QMD/stream struct fields.
	char* reason[4] = {0};
	for (int enabled_tpc = 0; enabled_tpc < num_tpcs && enabled_tpc < 128; enabled_tpc += 32) {
		// Build a mask with every bit set except `enabled_tpc`'s
		// (a set bit disables the corresponding TPC)
		uint128_t mask = 1;
		mask <<= enabled_tpc;
		mask = ~mask;

		// Apply partitioning to enable only the first TPC of each 32-bit block
		switch (part_type) {
		case PARTITION_GLOBAL:
			libsmctrl_set_global_mask(mask);
			break;
		case PARTITION_STREAM:
			libsmctrl_set_stream_mask_ext(stream, mask);
			break;
		case PARTITION_NEXT:
			libsmctrl_set_next_mask(mask);
			break;
		default:
			error(1, 0, "Shared test core called with unrecognized partitioning type.");
		}

		// Verify that partitioning changes the SMID distribution
		read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_partitioned_d);
		SAFE(cudaGetLastError()); // Surface any kernel-launch-configuration error
		SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

		// Make sure it only ran on the number of TPCs provided
		// May run on up to two SMs, as up to two per TPC
		uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too
		if (uniq_partitioned > sms_per_tpc) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to "
			       "constrain all kernels to a single TPC, a kernel of %d blocks of "
			       "1024 threads was launched and found to run on %d SMs (at most %d---"
			       "one TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
			return 1;
		}

		// Make sure it ran on the right TPC. count_unique() sorted the
		// array, so [0] is the lowest SM ID seen and [NUM_BLOCKS - 1]
		// the highest. (The lower-bound check previously tested the
		// maximum against the lower bound, missing too-low SM IDs.)
		if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
		    smids_partitioned_h[0] < (enabled_tpc * sms_per_tpc)) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to"
			       "constrain all kernels to TPC %d, a kernel was run and found "
			       "to run on an SM IDs: as high as %d and as low as %d (range of %d to %d expected).\n",
			       program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
			return 1;
		}

		// Div by 32 via a shift
		asprintf(&reason[enabled_tpc >> 5],
		         "With a partition enabled which "
		         "contained only TPC ID %d, the test kernel was found to use only %d "
		         "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained"
		         " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc);
	}

	// All subtests passed; release test resources
	free(smids_native_h);
	free(smids_partitioned_h);
	SAFE(cudaFree(smids_native_d));
	SAFE(cudaFree(smids_partitioned_d));
	SAFE(cudaStreamDestroy(stream));

	printf("%s: Test passed!\n", program_invocation_name);
	for (int i = 0; i < 4 && reason[i]; i++) {
		printf("%s: Reason %d: %s\n", program_invocation_name, i + 1, reason[i]);
		free(reason[i]); // asprintf() allocated; release after printing
	}
	return 0;
}
140