aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <bakitajoshua@gmail.com>2023-10-05 15:28:08 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2023-10-16 12:15:58 -0400
commit6e552809ce6c3fc73ac3e95c8d971a972b842e4b (patch)
tree827c23d505b0fddf29d9803b11a5f7d54c3ca25c
parent9ed721de0e9ce564b7c852e38359398b019a5c2f (diff)
Fix libsmctrl_set_stream_mask() on the TX2 with CUDA 9.0 + cleanup
This function was previously unreliable when using CUDA 9.0 on the Jetson TX2. Also update some version comments and remove `set_sm_mask()`---a legacy partitioning function that's no longer used.
-rw-r--r--libsmctrl.c68
-rw-r--r--libsmctrl.h11
2 files changed, 56 insertions, 23 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 98be1ef..94578a1 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -1,21 +1,15 @@
1/** 1/**
2 * Copyright 2022 Joshua Bakita 2 * Copyright 2023 Joshua Bakita
3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug 3 * Library to control SM masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda. 4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 */ 5 */
6
7//#include "/playpen/playpen/cuda-11.8/include/cuda.h"
8#include <cuda.h> 6#include <cuda.h>
9//#include <cuda_runtime.h>
10//#ifndef CUDA_VERSION
11//#warning libsmctrl: CUDA driver library must be included before libsmctrl.h.
12//#endif
13 7
14#include <stdint.h>
15#include <errno.h> 8#include <errno.h>
16#include <fcntl.h> 9#include <fcntl.h>
17#include <unistd.h> 10#include <stdint.h>
18#include <stdio.h> 11#include <stdio.h>
12#include <unistd.h>
19 13
20// Layout of mask control fields in CUDA's `globals` struct 14// Layout of mask control fields in CUDA's `globals` struct
21struct global_sm_control { 15struct global_sm_control {
@@ -65,7 +59,7 @@ static void setup_sm_control_10() {
65 59
66/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/ 60/*** QMD/TMD-based SM Mask Control via Debug Callback. CUDA 11+ ***/
67 61
68// Tested working on CUDA x86_64 11.0-11.8. 62// Tested working on CUDA x86_64 11.0-12.2.
69// Tested not working on aarch64 or x86_64 10.2 63// Tested not working on aarch64 or x86_64 10.2
70static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; 64static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
71#define LAUNCH_DOMAIN 0x3 65#define LAUNCH_DOMAIN 0x3
@@ -141,10 +135,6 @@ void libsmctrl_set_global_mask(uint64_t mask) {
141 } 135 }
142} 136}
143 137
144void set_sm_mask(uint64_t mask) {
145 libsmctrl_set_global_mask(mask);
146}
147
148// Set mask for next launch from this thread 138// Set mask for next launch from this thread
149void libsmctrl_set_next_mask(uint64_t mask) { 139void libsmctrl_set_next_mask(uint64_t mask) {
150 if (!sm_control_setup_called) 140 if (!sm_control_setup_called)
@@ -157,6 +147,7 @@ void libsmctrl_set_next_mask(uint64_t mask) {
157 147
158#define CU_8_0_MASK_OFF 0xec 148#define CU_8_0_MASK_OFF 0xec
159#define CU_9_0_MASK_OFF 0x130 149#define CU_9_0_MASK_OFF 0x130
150#define CU_9_0_MASK_OFF_TX2 0x128 // CUDA 9.0 is slightly different on the TX2
160// CUDA 9.0 and 9.1 use the same offset 151// CUDA 9.0 and 9.1 use the same offset
161#define CU_9_2_MASK_OFF 0x140 152#define CU_9_2_MASK_OFF 0x140
162#define CU_10_0_MASK_OFF 0x24c 153#define CU_10_0_MASK_OFF 0x24c
@@ -177,7 +168,35 @@ struct stream_sm_mask {
177 uint32_t lower; 168 uint32_t lower;
178} __attribute__((packed)); 169} __attribute__((packed));
179 170
180// Should work for CUDA 9.1, 10.0-11.8, 12.0-12.1 171// Check if this system has a Parker SoC (TX2/PX2 chip)
172// (CUDA 9.0 behaves slightly differently on this platform.)
173// @return 1 if detected, 0 if not, -cuda_err on error
174#if __aarch64__
175int detect_parker_soc() {
176 int cap_major, cap_minor, err, dev_count;
177 if (err = cuDeviceGetCount(&dev_count))
178 return -err;
179 // As CUDA devices are numbered by order of compute power, check every
180 // device, in case a powerful discrete GPU is attached (such as on the
181 // DRIVE PX2). We detect the Parker SoC via its unique CUDA compute
182 // capability: 6.2.
183 for (int i = 0; i < dev_count; i++) {
184 if (err = cuDeviceGetAttribute(&cap_minor,
185 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
186 i))
187 return -err;
188 if (err = cuDeviceGetAttribute(&cap_major,
189 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
190 i))
191 return -err;
192 if (cap_major == 6 && cap_minor == 2)
193 return 1;
194 }
195 return 0;
196}
197#endif // __aarch64__
198
199// Should work for CUDA 8.0 through 12.1
181// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in 200// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
182// our header 201// our header
183void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { 202void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
@@ -189,9 +208,26 @@ void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
189 case 8000: 208 case 8000:
190 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF); 209 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
191 case 9000: 210 case 9000:
192 case 9010: 211 case 9010: {
193 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF); 212 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
213#if __aarch64__
214 // Jetson TX2 offset is slightly different on CUDA 9.0.
215 // Only compile the check into ARM64 builds.
216 int is_parker;
217 const char* err_str;
218 if ((is_parker = detect_parker_soc()) < 0) {
219 cuGetErrorName(-is_parker, &err_str);
220 fprintf(stderr, "libsmctrl_set_stream_mask: CUDA call "
221 "failed while doing compatibilty test."
222 "Error, '%s'. Not applying stream "
223 "mask.\n", err_str);
224 }
225
226 if (is_parker)
227 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_TX2);
228#endif
194 break; 229 break;
230 }
195 case 9020: 231 case 9020:
196 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF); 232 hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF);
197 break; 233 break;
diff --git a/libsmctrl.h b/libsmctrl.h
index 64ae7a7..f144437 100644
--- a/libsmctrl.h
+++ b/libsmctrl.h
@@ -1,5 +1,5 @@
1/** 1/**
2 * Copyright 2022 Joshua Bakita 2 * Copyright 2023 Joshua Bakita
3 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug 3 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
4 * logic in the CUDA driver library, and thus requires a build with -lcuda. 4 * logic in the CUDA driver library, and thus requires a build with -lcuda.
5 */ 5 */
@@ -12,23 +12,20 @@ extern "C" {
12 12
13// Set global default TPC mask for all kernels, incl. CUDA-internal ones 13// Set global default TPC mask for all kernels, incl. CUDA-internal ones
14// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 14// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
15// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 11.8 15// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1
16extern void libsmctrl_set_global_mask(uint64_t mask); 16extern void libsmctrl_set_global_mask(uint64_t mask);
17// Set default TPC mask for all kernels launched via `stream` 17// Set default TPC mask for all kernels launched via `stream`
18// (overrides global mask) 18// (overrides global mask)
19// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on 19// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
20// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 20// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
21// Supported: CUDA 8.0 - CUDA 11.8 21// Supported: CUDA 8.0 - CUDA 12.1
22extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask); 22extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
23// Set TPC mask for the next kernel launch from the caller's CPU thread 23// Set TPC mask for the next kernel launch from the caller's CPU thread
24// (overrides global and per-stream masks, applies only to next launch). 24// (overrides global and per-stream masks, applies only to next launch).
25// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks) 25// @param mask A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
26// Supported: CUDA 11.0 - CUDA 11.8 26// Supported: CUDA 11.0 - CUDA 12.1
27extern void libsmctrl_set_next_mask(uint64_t mask); 27extern void libsmctrl_set_next_mask(uint64_t mask);
28 28
29// **DEPRECATED**: Old name for libsmctrl_set_global_mask()
30extern void set_sm_mask(uint64_t mask) __attribute__((deprecated("Use libsmctrl_set_global_mask()")));
31
32/** 29/**
33 * Notes on Bitmasks 30 * Notes on Bitmasks
34 * 31 *