diff options
| author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-05 15:28:08 -0400 |
|---|---|---|
| committer | Joshua Bakita <jbakita@cs.unc.edu> | 2023-10-16 12:15:58 -0400 |
| commit | 6e552809ce6c3fc73ac3e95c8d971a972b842e4b (patch) | |
| tree | 827c23d505b0fddf29d9803b11a5f7d54c3ca25c | |
| parent | 9ed721de0e9ce564b7c852e38359398b019a5c2f (diff) | |
Fix libsmctrl_set_stream_mask() on the TX2 with CUDA 9.0 + cleanup
This function was previously unreliable when using CUDA 9.0 on the
Jetson TX2.
Also update some version comments and remove `set_sm_mask()`—a
legacy partitioning function that's no longer used.
| -rw-r--r-- | libsmctrl.c | 68 | ||||
| -rw-r--r-- | libsmctrl.h | 11 |
2 files changed, 56 insertions, 23 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 98be1ef..94578a1 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -1,21 +1,15 @@ | |||
| 1 | /** | 1 | /** |
| 2 | * Copyright 2022 Joshua Bakita | 2 | * Copyright 2023 Joshua Bakita |
| 3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug |
| 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
| 5 | */ | 5 | */ |
| 6 | |||
| 7 | //#include "/playpen/playpen/cuda-11.8/include/cuda.h" | ||
| 8 | #include <cuda.h> | 6 | #include <cuda.h> |
| 9 | //#include <cuda_runtime.h> | ||
| 10 | //#ifndef CUDA_VERSION | ||
| 11 | //#warning libsmctrl: CUDA driver library must be included before libsmctrl.h. | ||
| 12 | //#endif | ||
| 13 | 7 | ||
| 14 | #include <stdint.h> | ||
| 15 | #include <errno.h> | 8 | #include <errno.h> |
| 16 | #include <fcntl.h> | 9 | #include <fcntl.h> |
| 17 | #include <unistd.h> | 10 | #include <stdint.h> |
| 18 | #include <stdio.h> | 11 | #include <stdio.h> |
| 12 | #include <unistd.h> | ||
| 19 | 13 | ||
| 20 | // Layout of mask control fields in CUDA's `globals` struct | 14 | // Layout of mask control fields in CUDA's `globals` struct |
| 21 | struct global_sm_control { | 15 | struct global_sm_control { |
| @@ -65,7 +59,7 @@ static void setup_sm_control_10() { | |||
| 65 | 59 | ||
| 66 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ | 60 | /*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ |
| 67 | 61 | ||
| 68 | // Tested working on CUDA x86_64 11.0-11.8. | 62 | // Tested working on CUDA x86_64 11.0-12.2. |
| 69 | // Tested not working on aarch64 or x86_64 10.2 | 63 | // Tested not working on aarch64 or x86_64 10.2 |
| 70 | static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; | 64 | static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; |
| 71 | #define LAUNCH_DOMAIN 0x3 | 65 | #define LAUNCH_DOMAIN 0x3 |
| @@ -141,10 +135,6 @@ void libsmctrl_set_global_mask(uint64_t mask) { | |||
| 141 | } | 135 | } |
| 142 | } | 136 | } |
| 143 | 137 | ||
| 144 | void set_sm_mask(uint64_t mask) { | ||
| 145 | libsmctrl_set_global_mask(mask); | ||
| 146 | } | ||
| 147 | |||
| 148 | // Set mask for next launch from this thread | 138 | // Set mask for next launch from this thread |
| 149 | void libsmctrl_set_next_mask(uint64_t mask) { | 139 | void libsmctrl_set_next_mask(uint64_t mask) { |
| 150 | if (!sm_control_setup_called) | 140 | if (!sm_control_setup_called) |
| @@ -157,6 +147,7 @@ void libsmctrl_set_next_mask(uint64_t mask) { | |||
| 157 | 147 | ||
| 158 | #define CU_8_0_MASK_OFF 0xec | 148 | #define CU_8_0_MASK_OFF 0xec |
| 159 | #define CU_9_0_MASK_OFF 0x130 | 149 | #define CU_9_0_MASK_OFF 0x130 |
| 150 | #define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2 | ||
| 160 | // CUDA 9.0 and 9.1 use the same offset | 151 | // CUDA 9.0 and 9.1 use the same offset |
| 161 | #define CU_9_2_MASK_OFF 0x140 | 152 | #define CU_9_2_MASK_OFF 0x140 |
| 162 | #define CU_10_0_MASK_OFF 0x24c | 153 | #define CU_10_0_MASK_OFF 0x24c |
| @@ -177,7 +168,35 @@ struct stream_sm_mask { | |||
| 177 | uint32_t lower; | 168 | uint32_t lower; |
| 178 | } __attribute__((packed)); | 169 | } __attribute__((packed)); |
| 179 | 170 | ||
| 180 | // Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1 | 171 | // Check if this system has a Parker SoC (TX2/PX2 chip) |
| 172 | // (CUDA 9.0 behaves slightly differently on this platform.) | ||
| 173 | // @return 1 if detected, 0 if not, -cuda_err on error | ||
| 174 | #if __aarch64__ | ||
| 175 | int detect_parker_soc() { | ||
| 176 | int cap_major, cap_minor, err, dev_count; | ||
| 177 | if (err = cuDeviceGetCount(&dev_count)) | ||
| 178 | return -err; | ||
| 179 | // As CUDA devices are numbered by order of compute power, check every | ||
| 180 | // device, in case a powerful discrete GPU is attached (such as on the | ||
| 181 | // DRIVE PX2). We detect the Parker SoC via its unique CUDA compute | ||
| 182 | // capability: 6.2. | ||
| 183 | for (int i = 0; i < dev_count; i++) { | ||
| 184 | if (err = cuDeviceGetAttribute(&cap_minor, | ||
| 185 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, | ||
| 186 | i)) | ||
| 187 | return -err; | ||
| 188 | if (err = cuDeviceGetAttribute(&cap_major, | ||
| 189 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, | ||
| 190 | i)) | ||
| 191 | return -err; | ||
| 192 | if (cap_major == 6 && cap_minor == 2) | ||
| 193 | return 1; | ||
| 194 | } | ||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | #endif // __aarch64__ | ||
| 198 | |||
| 199 | // Should work for CUDA 8.0 through 12.1 | ||
| 181 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 200 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
| 182 | // our header | 201 | // our header |
| 183 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 202 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
| @@ -189,9 +208,26 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | |||
| 189 | case 8000: | 208 | case 8000: |
| 190 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); | 209 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); |
| 191 | case 9000: | 210 | case 9000: |
| 192 | case 9010: | 211 | case 9010: { |
| 193 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); | 212 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); |
| 213 | #if __aarch64__ | ||
| 214 | // Jetson TX2 offset is slightly different on CUDA 9.0. | ||
| 215 | // Only compile the check into ARM64 builds. | ||
| 216 | int is_parker; | ||
| 217 | const char* err_str; | ||
| 218 | if ((is_parker = detect_parker_soc()) < 0) { | ||
| 219 | cuGetErrorName(-is_parker, &err_str); | ||
| 220 | fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call " | ||
| 221 | "failed while doing compatibilty test." | ||
| 222 | "Error, '%s'. Not applying stream " | ||
| 223 | "mask.\n", err_str); | ||
| 224 | } | ||
| 225 | |||
| 226 | if (is_parker) | ||
| 227 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2); | ||
| 228 | #endif | ||
| 194 | break; | 229 | break; |
| 230 | } | ||
| 195 | case 9020: | 231 | case 9020: |
| 196 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); | 232 | hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); |
| 197 | break; | 233 | break; |
diff --git a/libsmctrl.h b/libsmctrl.h index 64ae7a7..f144437 100644 --- a/libsmctrl.h +++ b/libsmctrl.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /** | 1 | /** |
| 2 | * Copyright 2022 Joshua Bakita | 2 | * Copyright 2023 Joshua Bakita |
| 3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control TPC masks on CUDA launches. Co-opts preexisting debug |
| 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
| 5 | */ | 5 | */ |
| @@ -12,23 +12,20 @@ extern "C" { | |||
| 12 | 12 | ||
| 13 | // Set global default TPC mask for all kernels, incl. CUDA-internal ones | 13 | // Set global default TPC mask for all kernels, incl. CUDA-internal ones |
| 14 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 14 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
| 15 | // Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8 | 15 | // Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1 |
| 16 | extern void libsmctrl_set_global_mask(uint64_t mask); | 16 | extern void libsmctrl_set_global_mask(uint64_t mask); |
| 17 | // Set default TPC mask for all kernels launched via `stream` | 17 | // Set default TPC mask for all kernels launched via `stream` |
| 18 | // (overrides global mask) | 18 | // (overrides global mask) |
| 19 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on | 19 | // @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on |
| 20 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 20 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
| 21 | // Supported: CUDA 8.0 - CUDA 11.8 | 21 | // Supported: CUDA 8.0 - CUDA 12.1 |
| 22 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); | 22 | extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); |
| 23 | // Set TPC mask for the next kernel launch from the caller's CPU thread | 23 | // Set TPC mask for the next kernel launch from the caller's CPU thread |
| 24 | // (overrides global and per-stream masks, applies only to next launch). | 24 | // (overrides global and per-stream masks, applies only to next launch). |
| 25 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) | 25 | // @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) |
| 26 | // Supported: CUDA 11.0 - CUDA 11.8 | 26 | // Supported: CUDA 11.0 - CUDA 12.1 |
| 27 | extern void libsmctrl_set_next_mask(uint64_t mask); | 27 | extern void libsmctrl_set_next_mask(uint64_t mask); |
| 28 | 28 | ||
| 29 | // **DEPRECATED**: Old name for libsmctrl_set_global_mask() | ||
| 30 | extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()"))); | ||
| 31 | |||
| 32 | /** | 29 | /** |
| 33 | * Notes on Bitmasks | 30 | * Notes on Bitmasks |
| 34 | * | 31 | * |
