1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
#include <error.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <cuda_runtime.h>
#include "libsmctrl.h"
#include "testbench.h"
__global__ void read_and_store_smid(uint8_t* smid_arr) {
if (threadIdx.x != 1)
return;
int smid;
asm("mov.u32 %0, %%smid;" : "=r"(smid));
smid_arr[blockIdx.x] = smid;
}
// Assuming SMs continue to support a maximum of 2048 resident threads, six
// blocks of 1024 threads should span at least three SMs without partitioning
#define NUM_BLOCKS 6
int sort_asc(const void* a, const void* b) {
return *(uint8_t*)a - *(uint8_t*)b;
}
// Warning: Mutates input array via qsort
int count_unique(uint8_t* arr, int len) {
qsort(arr, len, 1, sort_asc);
int num_uniq = 1;
for (int i = 0; i < len - 1; i++)
num_uniq += (arr[i] != arr[i + 1]);
return num_uniq;
}
int main() {
cudaError_t err; // Needed by SAFE() macro
int res;
uint8_t *smids_native_d, *smids_native_h;
uint8_t *smids_partitioned_d, *smids_partitioned_h;
int uniq_native, uniq_partitioned;
uint32_t num_tpcs;
int num_sms, sms_per_tpc;
// Determine number of SMs per TPC
SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test");
sms_per_tpc = num_sms/num_tpcs;
// Test baseline (native) behavior without partitioning
SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d);
SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
if (uniq_native < sms_per_tpc) {
printf("libsmctrl_test_global: ***Test failure.***\n"
"libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 "
"threads were launched on the GPU, but only %d SMs were utilized, "
"when it was expected that at least %d would be used.\n", NUM_BLOCKS,
uniq_native, sms_per_tpc);
return 1;
}
// Verify that partitioning changes the SMID distribution
libsmctrl_set_global_mask(~0x1); // Enable only one TPC
SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d);
SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
// Make sure it only ran on the number of TPCs provided
// May run on up to two SMs, as up to two per TPC
uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS);
if (uniq_partitioned > sms_per_tpc) {
printf("libsmctrl_test_global: ***Test failure.***\n"
"libsmctrl_test_global: Reason: With global TPC mask set to "
"constrain all kernels to a single TPC, a kernel of %d blocks of "
"1024 threads was launched and found to run on %d SMs (at most %d---"
"one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
return 1;
}
// Make sure it ran on the right TPC
if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) {
printf("libsmctrl_test_global: ***Test failure.***\n"
"libsmctrl_test_global: Reason: With global TPC mask set to"
"constrain all kernels to the first TPC, a kernel was run and found "
"to run on an SM ID as high as %d (max of %d expected).\n",
smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1);
return 1;
}
printf("libsmctrl_test_global: Test passed!\n"
"libsmctrl_test_global: Reason: With a global partition enabled which "
"contained only TPC ID 0, the test kernel was found to use only %d "
"SMs (%d without), and all SMs in-use had IDs below %d (were contained"
" in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc);
return 0;
}
|