author    Joshua Bakita <bakitajoshua@gmail.com>  2023-11-29 18:00:31 -0500
committer Joshua Bakita <bakitajoshua@gmail.com>  2023-11-29 18:24:25 -0500
commit    8062646a185baa6d3934d1e19743ac671e943fa8
tree      e00b4a15b143262c6a5865393cd8a6d94116d50b
parent    3c075c8f71a7c85d735018143fc13a6eb91813eb
Add a README and tests for stream masking and next masking
Also rewrite the global masking test to be much more thorough.
-rw-r--r--  .gitignore                     |   5
-rw-r--r--  Makefile                       |  20
-rw-r--r--  README.md                      |  80
-rw-r--r--  libsmctrl_test_global_mask.c   |  10
-rw-r--r--  libsmctrl_test_global_mask.cu  | 103
-rw-r--r--  libsmctrl_test_mask_shared.cu  | 140
-rw-r--r--  libsmctrl_test_mask_shared.h   |  16
-rw-r--r--  libsmctrl_test_next_mask.c     |  10
-rw-r--r--  libsmctrl_test_stream_mask.c   |  10
-rw-r--r--  testbench.h                    |  31
10 files changed, 317 insertions(+), 108 deletions(-)
diff --git a/.gitignore b/.gitignore
index 437f923..553e0fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,9 @@ libsmctrl.a
 libsmctrl.o
 libsmctrl.so
 libsmctrl_test_gpc_info
+libsmctrl_test_global_mask
+libsmctrl_test_stream_mask
+libsmctrl_test_next_mask
 *.pyc
+*.o
+.gdb_history
diff --git a/Makefile b/Makefile
index 6abc124..b91f6b9 100644
--- a/Makefile
+++ b/Makefile
@@ -14,14 +14,24 @@ libsmctrl.a: libsmctrl.c libsmctrl.h
 	ar rcs $@ libsmctrl.o
 
 # Use static linking with tests to avoid LD_LIBRARY_PATH issues
-libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a
+libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a testbench.h
 	$(CC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
 
-libsmctrl_test_global_mask: libsmctrl_test_global_mask.cu libsmctrl.a
-	$(NVCC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
+libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h
+	$(NVCC) $< -c -g
+
+libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
+	$(NVCC) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS)
+
+libsmctrl_test_stream_mask: libsmctrl_test_stream_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
+	$(NVCC) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS)
+
+libsmctrl_test_next_mask: libsmctrl_test_next_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
+	$(NVCC) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS)
 
-tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask
+tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask libsmctrl_test_stream_mask libsmctrl_test_next_mask
 
 clean:
 	rm -f libsmctrl.so libsmctrl.a libsmctrl_test_gpu_info \
-		libsmctrl_test_global_mask
+		libsmctrl_test_mask_shared.o libsmctrl_test_global_mask \
+		libsmctrl_test_stream_mask libsmctrl_test_next_mask
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..705f2b6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,80 @@
# libsmctrl: Quick & Easy Hardware Compute Partitioning on NVIDIA GPUs

This library was developed as part of the following paper:

_J. Bakita and J. H. Anderson, "Hardware Compute Partitioning on NVIDIA GPUs", Proceedings of the 29th IEEE Real-Time and Embedded Technology and Applications Symposium, pp. 54-66, May 2023._

Please cite this paper in any work which leverages our library. Here's the BibTeX entry:
```
@inproceedings{bakita2023hardware,
  title={Hardware Compute Partitioning on {NVIDIA} {GPUs}},
  author={Bakita, Joshua and Anderson, James H},
  booktitle={Proceedings of the 29th IEEE Real-Time and Embedded Technology and Applications Symposium},
  year={2023},
  month={May},
  pages={54--66},
  _series={RTAS}
}
```

Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and libsmctrl.h for details and examples of how to use this library.
We strongly encourage consulting those resources first; the comments below serve merely as an addendum.

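For quick orientation, here is a minimal, hedged sketch of the simplest use of the library, following how the tests in this repository call it; the disable-mask semantics (a set bit disables a TPC) come from the tests' comments, and `my_kernel` is a placeholder:
```
#include <cuda_runtime.h>
#include "libsmctrl.h"

int main() {
	// Process-wide default partition: clear bits 0 and 1 so only TPCs 0 and 1
	// remain enabled for every kernel launched by this process.
	libsmctrl_set_global_mask(~0x3ull);

	// my_kernel<<<grid, block>>>(...);  // placeholder: launch CUDA work as usual
	cudaDeviceSynchronize();
	return 0;
}
```
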
## Run Tests
To test partitioning:
```
make tests
./libsmctrl_test_global_mask
./libsmctrl_test_stream_mask
./libsmctrl_test_next_mask
```

And if `nvdebug` has been installed:
```
./libsmctrl_test_gpc_info
```

## Supported GPUs

#### Known Working

- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
- CUDA 8.1 through 12.2
- `x86_64` and Jetson `aarch64` platforms

#### Known Issues

- `next_mask` will not override `stream_mask` on CUDA 12.0+
- `global_mask` and `next_mask` cannot disable TPCs with IDs above 64
  - Only relevant on GPUs with more than 64 TPCs, such as the RTX 6000 Ada
- Untested on H100 (compute capability 9.0)
- Untested on non-Jetson `aarch64` platforms

## Important Limitations

1. Only supports partitioning _within_ a single GPU context.
   At time of writing, it is challenging, if not impossible, to share a GPU context across multiple CPU address spaces.
   The implication is that your applications must first be combined together into a single CPU process (see the sketch after this list for one way to then partition them by stream).
2. No aspect of this system prevents implicit synchronization on the GPU.
   See prior work, particularly that of Amert et al. (perhaps the CUPiD^RT paper), for ways to avoid this.
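
As an illustration of the single-process model, here is a hedged sketch (not from the paper) of partitioning two tasks that have been merged into one process: each gets its own stream and a disjoint TPC set via `libsmctrl_set_stream_mask_ext()`, the same call and `uint128_t` mask type the stream-mask test uses. The kernel launches are placeholders:
```
#include <cuda_runtime.h>
#include "libsmctrl.h"

int main() {
	cudaStream_t stream_a, stream_b;
	cudaStreamCreate(&stream_a);
	cudaStreamCreate(&stream_b);

	// Set bits disable TPCs for work launched in the associated stream.
	uint128_t mask_a = ~(uint128_t)0xf;          // stream A: only TPCs 0-3 enabled
	uint128_t mask_b = ~((uint128_t)0xf << 4);   // stream B: only TPCs 4-7 enabled
	libsmctrl_set_stream_mask_ext(stream_a, mask_a);
	libsmctrl_set_stream_mask_ext(stream_b, mask_b);

	// task_a_kernel<<<grid, block, 0, stream_a>>>(...);  // placeholder
	// task_b_kernel<<<grid, block, 0, stream_b>>>(...);  // placeholder
	cudaDeviceSynchronize();
	return 0;
}
```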

## Porting to New Architectures

Build the tests with `make tests`, then run:
```
for (( i = 0; ; i += 8 )); do MASK_OFF=$i ./libsmctrl_test_stream_mask && break; done
```

How this works:

1. If `MASK_OFF` is set, `libsmctrl` applies it as a byte offset to a base address for the location
   of the SM mask fields in CUDA's stream data structure.
   - That base address is the one for CUDA 12.2 at time of writing
2. The stream masking test is run.
3. If the test succeeded (returned zero), the loop breaks; otherwise the offset is advanced by 8 bytes and the test is retried.

Once the loop stops, take the offset it found and add it into the switch statement for the appropriate CUDA version and CPU architecture.
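
For intuition only, here is a rough sketch of what step 1 amounts to; it is not libsmctrl's actual code, and `base` stands in for the per-CUDA-version constant that libsmctrl.c really resolves:
```
#include <stdlib.h>

// Conceptual sketch only (not libsmctrl's implementation): MASK_OFF nudges
// the guessed location of the SM mask fields inside CUDA's internal stream
// structure, in steps of 8 bytes driven by the loop above.
static void* locate_mask_fields(void* stream_struct, long base) {
	char* env = getenv("MASK_OFF");
	long adjust = env ? atol(env) : 0;
	return (char*)stream_struct + base + adjust;
}
```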

## TODO

- Add a test to check that more-granularly-set compute masks override more-coarsely-set ones.
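
One possible shape for that test, sketched under the assumption (not yet verified here) that a per-stream mask should take precedence over the global mask; it reuses only calls that already appear in this commit:
```
#include "libsmctrl.h"
#include "libsmctrl_test_mask_shared.h"

int main() {
	// Coarse (global) setting: disable every TPC except TPC 1.
	libsmctrl_set_global_mask(~0x2ull);
	// The finer (per-stream) setting should win: the shared test applies its
	// own stream mask and then checks that work lands only on the TPCs that
	// stream mask allows, which conflicts with the global mask above.
	return test_constrained_size_and_location(PARTITION_STREAM);
}
```
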
diff --git a/libsmctrl_test_global_mask.c b/libsmctrl_test_global_mask.c
new file mode 100644
index 0000000..9e3189b
--- /dev/null
+++ b/libsmctrl_test_global_mask.c
@@ -0,0 +1,10 @@
// Copyright 2023 Joshua Bakita
// Test libsmctrl_set_global_mask().
// All types of partitioning use the same test, so this file is trivial.

#include "libsmctrl_test_mask_shared.h"

int main() {
	return test_constrained_size_and_location(PARTITION_GLOBAL);
}

diff --git a/libsmctrl_test_global_mask.cu b/libsmctrl_test_global_mask.cu
deleted file mode 100644
index f6a487f..0000000
--- a/libsmctrl_test_global_mask.cu
+++ /dev/null
@@ -1,103 +0,0 @@
#include <error.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <cuda_runtime.h>

#include "libsmctrl.h"
#include "testbench.h"

__global__ void read_and_store_smid(uint8_t* smid_arr) {
	if (threadIdx.x != 1)
		return;
	int smid;
	asm("mov.u32 %0, %%smid;" : "=r"(smid));
	smid_arr[blockIdx.x] = smid;
}

// Assuming SMs continue to support a maximum of 2048 resident threads, six
// blocks of 1024 threads should span at least three SMs without partitioning
#define NUM_BLOCKS 6

int sort_asc(const void* a, const void* b) {
	return *(uint8_t*)a - *(uint8_t*)b;
}

// Warning: Mutates input array via qsort
int count_unique(uint8_t* arr, int len) {
	qsort(arr, len, 1, sort_asc);
	int num_uniq = 1;
	for (int i = 0; i < len - 1; i++)
		num_uniq += (arr[i] != arr[i + 1]);
	return num_uniq;
}

int main() {
	cudaError_t err; // Needed by SAFE() macro
	int res;
	uint8_t *smids_native_d, *smids_native_h;
	uint8_t *smids_partitioned_d, *smids_partitioned_h;
	int uniq_native, uniq_partitioned;
	uint32_t num_tpcs;
	int num_sms, sms_per_tpc;

	// Determine number of SMs per TPC
	SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
	if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
		error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test");
	sms_per_tpc = num_sms/num_tpcs;

	// Test baseline (native) behavior without partitioning
	SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
	if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
	read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d);
	SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

	uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
	if (uniq_native < sms_per_tpc) {
		printf("libsmctrl_test_global: ***Test failure.***\n"
		       "libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 "
		       "threads were launched on the GPU, but only %d SMs were utilized, "
		       "when it was expected that at least %d would be used.\n", NUM_BLOCKS,
		       uniq_native, sms_per_tpc);
		return 1;
	}

	// Verify that partitioning changes the SMID distribution
	libsmctrl_set_global_mask(~0x1); // Enable only one TPC
	SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
	if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
	read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d);
	SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

	// Make sure it only ran on the number of TPCs provided
	// May run on up to two SMs, as up to two per TPC
	uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS);
	if (uniq_partitioned > sms_per_tpc) {
		printf("libsmctrl_test_global: ***Test failure.***\n"
		       "libsmctrl_test_global: Reason: With global TPC mask set to "
		       "constrain all kernels to a single TPC, a kernel of %d blocks of "
		       "1024 threads was launched and found to run on %d SMs (at most %d---"
		       "one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
		return 1;
	}

	// Make sure it ran on the right TPC
	if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) {
		printf("libsmctrl_test_global: ***Test failure.***\n"
		       "libsmctrl_test_global: Reason: With global TPC mask set to"
		       "constrain all kernels to the first TPC, a kernel was run and found "
		       "to run on an SM ID as high as %d (max of %d expected).\n",
		       smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1);
		return 1;
	}

	printf("libsmctrl_test_global: Test passed!\n"
	       "libsmctrl_test_global: Reason: With a global partition enabled which "
	       "contained only TPC ID 0, the test kernel was found to use only %d "
	       "SMs (%d without), and all SMs in-use had IDs below %d (were contained"
	       " in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc);
	return 0;
}
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu
new file mode 100644
index 0000000..c3817fa
--- /dev/null
+++ b/libsmctrl_test_mask_shared.cu
@@ -0,0 +1,140 @@
// Copyright 2023 Joshua Bakita
#include <error.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <cuda_runtime.h>

#include "libsmctrl.h"
#include "testbench.h"
#include "libsmctrl_test_mask_shared.h"

__global__ void read_and_store_smid(uint8_t* smid_arr) {
	if (threadIdx.x != 1)
		return;
	int smid;
	asm("mov.u32 %0, %%smid;" : "=r"(smid));
	smid_arr[blockIdx.x] = smid;
}

// Assuming SMs continue to support a maximum of 2048 resident threads, at most
// two 1024-thread blocks fit per SM, so this many blocks should spread across
// many SMs (and thus many TPCs) when no partitioning is applied.
#define NUM_BLOCKS 142

static int sort_asc(const void* a, const void* b) {
	return *(uint8_t*)a - *(uint8_t*)b;
}

// Warning: Mutates input array via qsort
static int count_unique(uint8_t* arr, int len) {
	qsort(arr, len, 1, sort_asc);
	int num_uniq = 1;
	for (int i = 0; i < len - 1; i++)
		num_uniq += (arr[i] != arr[i + 1]);
	return num_uniq;
}

// Test that adding an SM mask:
// 1. Constrains the number of SMs accessible
// 2. Constrains an application to the correct subset of SMs
int test_constrained_size_and_location(enum partitioning_type part_type) {
	int res;
	uint8_t *smids_native_d, *smids_native_h;
	uint8_t *smids_partitioned_d, *smids_partitioned_h;
	int uniq_native, uniq_partitioned;
	uint32_t num_tpcs;
	int num_sms, sms_per_tpc;
	cudaStream_t stream;

	SAFE(cudaStreamCreate(&stream));

	// Determine number of SMs per TPC
	SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
	if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
		error(1, res, "Unable to get TPC configuration for test");
	sms_per_tpc = num_sms/num_tpcs;

	// Test baseline (native) behavior without partitioning
	SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
	if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "Unable to allocate memory for test");
	read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_native_d);
	SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

	uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
	if (uniq_native < sms_per_tpc) {
		printf("%s: ***Test failure.***\n"
		       "%s: Reason: In baseline test, %d blocks of 1024 "
		       "threads were launched on the GPU, but only %d SMs were utilized, "
		       "when it was expected that at least %d would be used.\n",
		       program_invocation_name, program_invocation_name, NUM_BLOCKS,
		       uniq_native, sms_per_tpc);
		return 1;
	}

	// Test at 32-TPC boundaries to verify that the mask is applied in the
	// correct order to each of the QMD/stream struct fields.
	char* reason[4] = {0};
	for (int enabled_tpc = 0; enabled_tpc < num_tpcs && enabled_tpc < 128; enabled_tpc += 32) {
		uint128_t mask = 1;
		mask <<= enabled_tpc;
		mask = ~mask;

		// Apply partitioning to enable only the first TPC of each 32-bit block
		switch (part_type) {
		case PARTITION_GLOBAL:
			libsmctrl_set_global_mask(mask);
			break;
		case PARTITION_STREAM:
			libsmctrl_set_stream_mask_ext(stream, mask);
			break;
		case PARTITION_NEXT:
			libsmctrl_set_next_mask(mask);
			break;
		default:
			error(1, 0, "Shared test core called with unrecognized partitioning type.");
		}

		// Verify that partitioning changes the SMID distribution
		SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
		if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
			error(1, errno, "Unable to allocate memory for test");
		read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_partitioned_d);
		SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

		// Make sure it only ran on the number of TPCs provided
		// May run on up to two SMs, as up to two per TPC
		uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too
		if (uniq_partitioned > sms_per_tpc) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to "
			       "constrain all kernels to a single TPC, a kernel of %d blocks of "
			       "1024 threads was launched and found to run on %d SMs (at most %d---"
			       "one TPC---expected).\n",
			       program_invocation_name, program_invocation_name, NUM_BLOCKS,
			       uniq_partitioned, sms_per_tpc);
			return 1;
		}

		// Make sure it ran on the right TPC (the array is sorted: index 0 holds
		// the lowest SM ID used, index NUM_BLOCKS - 1 the highest)
		if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
		    smids_partitioned_h[0] < (enabled_tpc * sms_per_tpc)) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to "
			       "constrain all kernels to TPC %d, a kernel was run and found "
			       "to run on SM IDs as high as %d and as low as %d (range of %d to %d expected).\n",
			       program_invocation_name, program_invocation_name, enabled_tpc,
			       smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0],
			       enabled_tpc * sms_per_tpc, enabled_tpc * sms_per_tpc + sms_per_tpc - 1);
			return 1;
		}

		// Div by 32 via a shift
		asprintf(&reason[enabled_tpc >> 5],
		         "With a partition enabled which "
		         "contained only TPC ID %d, the test kernel was found to use only %d "
		         "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained"
		         " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native,
		         smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc);
	}

	printf("%s: Test passed!\n", program_invocation_name);
	for (int i = 0; i < 4 && reason[i]; i++)
		printf("%s: Reason %d: %s\n", program_invocation_name, i + 1, reason[i]);
	return 0;
}

diff --git a/libsmctrl_test_mask_shared.h b/libsmctrl_test_mask_shared.h
new file mode 100644
index 0000000..f95757d
--- /dev/null
+++ b/libsmctrl_test_mask_shared.h
@@ -0,0 +1,16 @@
// Copyright 2023 Joshua Bakita
#ifdef __cplusplus
extern "C" {
#endif

enum partitioning_type {
	PARTITION_GLOBAL,
	PARTITION_STREAM,
	PARTITION_NEXT,
};

extern int test_constrained_size_and_location(enum partitioning_type part_type);

#ifdef __cplusplus
}
#endif
diff --git a/libsmctrl_test_next_mask.c b/libsmctrl_test_next_mask.c
new file mode 100644
index 0000000..7faa9e5
--- /dev/null
+++ b/libsmctrl_test_next_mask.c
@@ -0,0 +1,10 @@
// Copyright 2023 Joshua Bakita
// Test libsmctrl_set_next_mask().
// All types of partitioning use the same test, so this file is trivial.

#include "libsmctrl_test_mask_shared.h"

int main() {
	return test_constrained_size_and_location(PARTITION_NEXT);
}

diff --git a/libsmctrl_test_stream_mask.c b/libsmctrl_test_stream_mask.c
new file mode 100644
index 0000000..063f934
--- /dev/null
+++ b/libsmctrl_test_stream_mask.c
@@ -0,0 +1,10 @@
// Copyright 2023 Joshua Bakita
// Test libsmctrl_set_stream_mask_ext().
// All types of partitioning use the same test, so this file is trivial.

#include "libsmctrl_test_mask_shared.h"

int main() {
	return test_constrained_size_and_location(PARTITION_STREAM);
}

diff --git a/testbench.h b/testbench.h
new file mode 100644
index 0000000..dff1211
--- /dev/null
+++ b/testbench.h
@@ -0,0 +1,31 @@
/* Copyright 2021-2023 Joshua Bakita
 * Header for miscellaneous experimental helper functions.
 */

// cudaError_t and CUresult can both safely be cast to an unsigned int
static __thread unsigned int __SAFE_err;

// The very strange cast in these macros is to satisfy two goals at tension:
// 1. This file should be able to be included in non-CUDA-using files, and thus
//    should use no CUDA types outside of this macro.
// 2. We want to typecheck uses of these macros. The driver and runtime APIs
//    do not have identical error numbers and/or meanings, so runtime library
//    calls should use SAFE, and driver library calls should use SAFE_D.
// These macros allow typechecking, but keep a generic global error variable.
#define SAFE(x) \
	if ((*(cudaError_t*)(&__SAFE_err) = (x)) != 0) { \
		printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
		       __FILE__, __LINE__, __SAFE_err, cudaGetErrorName((cudaError_t)__SAFE_err), cudaGetErrorString((cudaError_t)__SAFE_err), #x); \
		exit(1); \
	}

#define SAFE_D(x) \
	if ((*(CUresult*)&(__SAFE_err) = (x)) != 0) { \
		const char* name; \
		const char* desc; \
		cuGetErrorName((CUresult)__SAFE_err, &name); \
		cuGetErrorString((CUresult)__SAFE_err, &desc); \
		printf("(%s:%d) CUDA error %d: %s i.e. \"%s\" returned by %s. Aborting...\n", \
		       __FILE__, __LINE__, __SAFE_err, name, desc, #x); \
		exit(1); \
	}
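
To illustrate the runtime-vs-driver distinction the comment above draws, here is a hedged usage sketch (not part of this commit); both calls are standard CUDA API entry points, and the file would be linked against both the runtime and driver libraries:
```
#include <cuda.h>          // Driver API: CUresult, cuInit, cuGetErrorName
#include <cuda_runtime.h>  // Runtime API: cudaError_t, cudaGetDeviceCount
#include <stdio.h>
#include <stdlib.h>
#include "testbench.h"

int main() {
	int dev_count;
	// Runtime-API call returns cudaError_t, so check it with SAFE()
	SAFE(cudaGetDeviceCount(&dev_count));
	// Driver-API call returns CUresult, so check it with SAFE_D()
	SAFE_D(cuInit(0));
	printf("Found %d CUDA device(s)\n", dev_count);
	return 0;
}
```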