diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-05 15:28:08 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-16 12:15:58 -0400 |
commit | 6e552809ce6c3fc73ac3e95c8d971a972b842e4b (patch) | |
tree | 827c23d505b0fddf29d9803b11a5f7d54c3ca25c | |
parent | 9ed721de0e9ce564b7c852e38359398b019a5c2f (diff) |
Fix libsmctrl_set_stream_mask() on the TX2 with CUDA 9.0 + cleanup
This function was previously unreliable when using CUDA 9.0 on the
Jetson TX2.
Also update some version comments and remove `set_sm_mask()`, a
legacy partitioning function that's no longer used.
-rw-r--r-- | libsmctrl.c | 68 | ||||
-rw-r--r-- | libsmctrl.h | 11 |
2 files changed, 56 insertions, 23 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 98be1ef..94578a1 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -1,21 +1,15 @@ | |||
1 | /** | 1 | /** |
2 | * Copyright 2022 Joshua Bakita | 2 | * Copyright 2023 Joshua Bakita |
3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug |
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
5 | */ | 5 | */ |
6 | |||
7 | //#include "/playpen/playpen/cuda-11.8/include/cuda.h" | ||
8 | #include <cuda.h> | 6 | #include <cuda.h> |
9 | //#include <cuda_runtime.h> | ||
10 | //#ifndef CUDA_VERSION | ||
11 | //#warning libsmctrl: CUDA driver library must be included before libsmctrl.h. | ||
12 | //#endif | ||
13 | 7 | ||
14 | #include <stdint.h> | ||
15 | #include <errno.h> | 8 | #include <errno.h> |
16 | #include <fcntl.h> | 9 | #include <fcntl.h> |
17 | #include <unistd.h> | 10 | #include <stdint.h> |
18 | #include <stdio.h> | 11 | #include <stdio.h> |
12 | #include <unistd.h> | ||
19 | 13 | ||
20 | // Layout of mask control fields in CUDA's `globals` struct | 14 | // Layout of mask control fields in CUDA's `globals` struct |
21 | struct global_sm_control { | 15 | struct global_sm_control { |
@@ -65,7 +59,7 @@ static void setup_sm_control_10() { | |||
65 | 59 | ||
66 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ | 60 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ |
67 | 61 | ||
68 | // Tested working on CUDA x86_64 11.0-11.8. | 62 | // Tested working on CUDA x86_64 11.0-12.2. |
69 | // Tested not working on aarch64 or x86_64 10.2 | 63 | // Tested not working on aarch64 or x86_64 10.2 |
70 | static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; | 64 | static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; |
71 | #define LAUNCH_DOMAIN 0x3 | 65 | #define LAUNCH_DOMAIN 0x3 |
@@ -141,10 +135,6 @@ void libsmctrl_set_global_mask(uint64_t mask) { | |||
141 | } | 135 | } |
142 | } | 136 | } |
143 | 137 | ||
144 | void set_sm_mask(uint64_t mask) { | ||
145 | libsmctrl_set_global_mask(mask); | ||
146 | } | ||
147 | |||
148 | // Set mask for next launch from this thread | 138 | // Set mask for next launch from this thread |
149 | void libsmctrl_set_next_mask(uint64_t mask) { | 139 | void libsmctrl_set_next_mask(uint64_t mask) { |
150 | if (!sm_control_setup_called) | 140 | if (!sm_control_setup_called) |
@@ -157,6 +147,7 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
157 | 147 | ||
158 | #define CU_8_0_MASK_OFF 0xec | 148 | #define CU_8_0_MASK_OFF 0xec |
159 | #define CU_9_0_MASK_OFF 0x130 | 149 | #define CU_9_0_MASK_OFF 0x130 |
150 | #define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2 | ||
160 | // CUDA 9.0 and 9.1 use the same offset | 151 | // CUDA 9.0 and 9.1 use the same offset |
161 | #define CU_9_2_MASK_OFF 0x140 | 152 | #define CU_9_2_MASK_OFF 0x140 |
162 | #define CU_10_0_MASK_OFF 0x24c | 153 | #define CU_10_0_MASK_OFF 0x24c |
@@ -177,7 +168,35 @@ struct stream_sm_mask { | |||
177 | uint32_t lower; | 168 | uint32_t lower; |
178 | } __attribute__((packed)); | 169 | } __attribute__((packed)); |
179 | 170 | ||
180 | // Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1 | 171 | // Check if this system has a Parker SoC (TX2/PX2 chip) |
172 | // (CUDA 9.0 behaves slightly differently on this platform.) | ||
173 | // @return 1 if detected, 0 if not, -cuda_err on error | ||
174 | #if __aarch64__ | ||
175 | int detect_parker_soc() { | ||
176 | int cap_major, cap_minor, err, dev_count; | ||
177 | if (err = cuDeviceGetCount(&dev_count)) | ||
178 | return -err; | ||
179 | // As CUDA devices are numbered by order of compute power, check every | ||
180 | // device, in case a powerful discrete GPU is attached (such as on the | ||
181 | // DRIVE PX2). We detect the Parker SoC via its unique CUDA compute | ||
182 | // capability: 6.2. | ||
183 | for (int i = 0; i < dev_count; i++) { | ||
184 | if (err = cuDeviceGetAttribute(&cap_minor, | ||
185 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, | ||
186 | i)) | ||
187 | return -err; | ||
188 | if (err = cuDeviceGetAttribute(&cap_major, | ||
189 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, | ||
190 | i)) | ||
191 | return -err; | ||
192 | if (cap_major == 6 && cap_minor == 2) | ||
193 | return 1; | ||
194 | } | ||
195 | return 0; | ||
196 | } | ||
197 | #endif // __aarch64__ | ||
198 | |||
199 | // Should work for CUDA 8.0 through 12.1 | ||
181 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 200 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
182 | // our header | 201 | // our header |
183 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 202 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
@@ -189,9 +208,26 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
189 | case 8000: | 208 | case 8000: |
190 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); | 209 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); |
191 | case 9000: | 210 | case 9000: |
192 | case 9010: | 211 | case 9010: { |
193 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); | 212 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); |
213 | #if __aarch64__ | ||
214 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
215 | // Only compile the check into ARM64 builds. | ||
216 | int is_parker; | ||
217 | const char* err_str; | ||
218 | if ((is_parker = detect_parker_soc()) < 0) { | ||
219 | cuGetErrorName(-is_parker, &err_str); | ||
220 | fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call " | ||
221 | "failed while doing compatibilty test." | ||
222 | "Error, '%s'. Not applying stream " | ||
223 | "mask.\n", err_str); | ||
224 | } | ||
225 | |||
226 | if (is_parker) | ||
227 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2); | ||
228 | #endif | ||
194 | break; | 229 | break; |
230 | } | ||
195 | case 9020: | 231 | case 9020: |
196 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); | 232 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); |
197 | break; | 233 | break; |
diff --git a/libsmctrl.h b/libsmctrl.h index 64ae7a7..f144437 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /** | 1 | /** |
2 | * Copyright 2022 Joshua Bakita | 2 | * Copyright 2023 Joshua Bakita |
3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug |
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
5 | */ | 5 | */ |
@@ -12,23 +12,20 @@ extern "C" { | |||
12 | 12 | ||
13 | // Set global default TPC mask for all kernels, incl. CUDA-internal ones | 13 | // Set global default TPC mask for all kernels, incl. CUDA-internal ones |
14 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 14 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
15 | // Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8 | 15 | // Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1 |
16 | extern void libsmctrl_set_global_mask(uint64_t mask); | 16 | extern void libsmctrl_set_global_mask(uint64_t mask); |
17 | // Set default TPC mask for all kernels launched via `stream` | 17 | // Set default TPC mask for all kernels launched via `stream` |
18 | // (overrides global mask) | 18 | // (overrides global mask) |
19 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | 19 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on |
20 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 20 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
21 | // Supported: CUDA 8.0 - CUDA 11.8 | 21 | // Supported: CUDA 8.0 - CUDA 12.1 |
22 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 22 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
23 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 23 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
24 | // (overrides global and per-stream masks, applies only to next launch). | 24 | // (overrides global and per-stream masks, applies only to next launch). |
25 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 25 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
26 | // Supported: CUDA 11.0 - CUDA 11.8 | 26 | // Supported: CUDA 11.0 - CUDA 12.1 |
27 | extern void libsmctrl_set_next_mask(uint64_t mask); | 27 | extern void libsmctrl_set_next_mask(uint64_t mask); |
28 | 28 | ||
29 | // **DEPRECATED**: Old name for libsmctrl_set_global_mask() | ||
30 | extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()"))); | ||
31 | |||
32 | /** | 29 | /** |
33 | * Notes on Bitmasks | 30 | * Notes on Bitmasks |
34 | * | 31 | * |