aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2025-06-16 19:29:07 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2025-06-17 14:01:49 -0400
commit89177fce34edb5ad0059a41548888d05588cc1c5 (patch)
tree096dc302bb5e17e3987c45a59ef02c69ec73e9ed
parent03ae77e35d35b2a82f5387d1903cfa954b696edd (diff)
Rewrite nvtaskset and implementation of partitioning for unmodified tasks
Rather than requiring libsmctrl.so to be preloaded, we now wrap libcuda.so.1. All CUDA-using applications will load libcuda.so.1, ensuring that our wrapper will always be dynamically loaded, no matter if LD_PRELOAD is enabled, or if a program has been staticly linked. All that needs to be done is that the location of our "fake" libcuda.so.1 need to be put within the loader search path. This can be done by setting LD_LIBRARY_PATH, or by installing our wrapper into /lib/x86_64-linux-gnu. The mask can still be set via the LIBSMCTRL_MASK environment variable, but the easier-to-use nvtaskset tool is now the recommended way to view or change the supreme TPC mask for any CUDA-using application. This allows launching a program on the first two GPCs via a command as simple as: ./nvtaskset -g 0-1 ./a_program a_program_args (Note that use of the -g option requires the nvdebug kernel module to first be loaded.) These changes support the final version of the ECRTS'25 paper. Note that nvtaskset does not yet fully support multi-GPU systems. Bugfixes: - Fix crash that would occur if both libsmctrl.so and libsmctrl.a were built into an application. - Correctly use GPU ID when initializing a context in `libsmctrl_test_gpc_info`. - Include `nvtaskset` as a prerequisite for `libsmctrl_test_supreme_mask`. - Fix malfunction of `libsmctrl_test_gpc_info` if CUDA_VISIBLE_DEVICES is set. Other minor changes: - Adds make target to run all the tests. - Fixes typos in comments. - Enables -Wall build option. - Upgrades supreme mask from 64 to 128 bits. - Removes `detect_parker_soc()` from the global namespace. - Adjusts test messages to be more succinct. - Updates README with overview of how to partition unmodified applications, more details on the tests, and information on the new ECRTS'25 paper.
-rw-r--r--.gitignore2
-rw-r--r--Makefile47
-rw-r--r--README.md83
-rw-r--r--libsmctrl.c212
-rw-r--r--libsmctrl_test_gpc_info.c4
-rw-r--r--libsmctrl_test_mask_shared.cu31
-rw-r--r--nvtaskset.c520
7 files changed, 659 insertions, 240 deletions
diff --git a/.gitignore b/.gitignore
index 5f0fdbe..c42b364 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,8 @@ libsmctrl_test_stream_mask
8libsmctrl_test_stream_mask_override 8libsmctrl_test_stream_mask_override
9libsmctrl_test_next_mask 9libsmctrl_test_next_mask
10libsmctrl_test_next_mask_override 10libsmctrl_test_next_mask_override
11libcuda.so.1
12nvtaskset
11*.pyc 13*.pyc
12*.o 14*.o
13.gdb_history 15.gdb_history
diff --git a/Makefile b/Makefile
index 62ec245..87a5708 100644
--- a/Makefile
+++ b/Makefile
@@ -3,9 +3,11 @@ CUDA ?= /usr/local/cuda
3# Note that CXX and CC are predefined as g++ and cc (respectively) by Make 3# Note that CXX and CC are predefined as g++ and cc (respectively) by Make
4NVCC ?= $(CUDA)/bin/nvcc 4NVCC ?= $(CUDA)/bin/nvcc
5# Everything has to have -lcuda, as it's needed for libsmctrl 5# Everything has to have -lcuda, as it's needed for libsmctrl
6LDFLAGS := -lcuda -I$(CUDA)/include -L$(CUDA)/lib64 6LDFLAGS := -ldl -lcuda -I$(CUDA)/include -L$(CUDA)/lib64
7ARCH = $(shell $(CC) -dumpmachine)
8CFLAGS := -Wall -Wno-parentheses
7 9
8.PHONY: clean tests all 10.PHONY: clean tests all install remove run_tests
9 11
10# ----- Main Library ----- 12# ----- Main Library -----
11libsmctrl.so: libsmctrl.c libsmctrl.h 13libsmctrl.so: libsmctrl.c libsmctrl.h
@@ -14,9 +16,14 @@ libsmctrl.so: libsmctrl.c libsmctrl.h
14# -fPIC is needed even if built as a static library, in case we are linked into 16# -fPIC is needed even if built as a static library, in case we are linked into
15# another shared library 17# another shared library
16libsmctrl.a: libsmctrl.c libsmctrl.h 18libsmctrl.a: libsmctrl.c libsmctrl.h
17 $(CC) $< -c -o libsmctrl.o -fPIC $(CFLAGS) $(LDFLAGS) 19 $(CC) $< -c -o libsmctrl.o -fPIC -DLIBSMCTRL_STATIC $(CFLAGS) $(LDFLAGS)
18 ar rcs $@ libsmctrl.o 20 ar rcs $@ libsmctrl.o
19 21
22# ----- CUDA Wrapper -----
23libcuda.so.1: libsmctrl.c libsmctrl.h
24 $(CC) $< -shared -o $@ -fPIC -DLIBSMCTRL_WRAPPER $(CFLAGS) $(LDFLAGS)
25 patchelf libcuda.so.1 --add-needed libcuda.so
26
20# ----- Utilities ----- 27# ----- Utilities -----
21# Use static linking with tests to avoid LD_LIBRARY_PATH issues 28# Use static linking with tests to avoid LD_LIBRARY_PATH issues
22nvtaskset: nvtaskset.c libsmctrl.so libsmctrl.a 29nvtaskset: nvtaskset.c libsmctrl.so libsmctrl.a
@@ -29,7 +36,7 @@ libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a testbench.h
29libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h 36libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h
30 $(NVCC) -ccbin $(CXX) $< -c -g 37 $(NVCC) -ccbin $(CXX) $< -c -g
31 38
32libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o 39libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o libcuda.so.1 nvtaskset
33 $(NVCC) -ccbin $(CXX) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS) 40 $(NVCC) -ccbin $(CXX) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS)
34 41
35libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o 42libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
@@ -52,7 +59,7 @@ tests: libsmctrl_test_gpc_info libsmctrl_test_supreme_mask \
52 libsmctrl_test_stream_mask_override libsmctrl_test_next_mask \ 59 libsmctrl_test_stream_mask_override libsmctrl_test_next_mask \
53 libsmctrl_test_next_mask_override 60 libsmctrl_test_next_mask_override
54 61
55all: libsmctrl.so nvtaskset tests 62all: libsmctrl.so libcuda.so.1 nvtaskset tests
56 63
57clean: 64clean:
58 rm -f libsmctrl.so libsmctrl.o libsmctrl.a libsmctrl_test_gpc_info \ 65 rm -f libsmctrl.so libsmctrl.o libsmctrl.a libsmctrl_test_gpc_info \
@@ -60,4 +67,32 @@ clean:
60 libsmctrl_test_global_mask \ 67 libsmctrl_test_global_mask \
61 libsmctrl_test_stream_mask libsmctrl_test_stream_mask_override \ 68 libsmctrl_test_stream_mask libsmctrl_test_stream_mask_override \
62 libsmctrl_test_next_mask libsmctrl_test_next_mask_override \ 69 libsmctrl_test_next_mask libsmctrl_test_next_mask_override \
63 nvtaskset 70 nvtaskset libcuda.so.1
71
72install: libcuda.so.1
73 @# Check that CUDA is installed first
74 test -f /lib/$(ARCH)/libcuda.so.*.*
75 @# Change libcuda.so link to bypass libcuda.so.1
76 sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so
77 @# Remove libcuda.so.1 symlink
78 sudo rm /lib/$(ARCH)/libcuda.so.1
79 @# Install wrapper as libcuda.so.1
80 sudo cp libcuda.so.1 /lib/$(ARCH)/libcuda.so.1
81
82remove:
83 @# Test that our library in installed first
84 test ! -L /lib/$(ARCH)/libcuda.so.1
85 @# Overwrite install with original symlinks
86 sudo ln -sf libcuda.so.1 /lib/$(ARCH)/libcuda.so
87 sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so.1
88
89run_tests: tests
90 ./libsmctrl_test_global_mask
91 ./libsmctrl_test_next_mask
92 ./libsmctrl_test_stream_mask
93 ./libsmctrl_test_next_mask_override
94 ./libsmctrl_test_stream_mask_override
95 @# Must set LD_LIBRARY_PATH in case make install has not been run
96 LD_LIBRARY_PATH=. ./libsmctrl_test_supreme_mask
97 ./libsmctrl_test_gpc_info
98 @ echo "All tests passed!"
diff --git a/README.md b/README.md
index f2be718..11ad153 100644
--- a/README.md
+++ b/README.md
@@ -13,16 +13,37 @@ Please cite this paper in any work which leverages our library. Here's the BibTe
13 year={2023}, 13 year={2023},
14 month={May}, 14 month={May},
15 pages={54--66}, 15 pages={54--66},
16 doi={10.1109/RTAS58335.2023.00012},
16 _series={RTAS} 17 _series={RTAS}
17} 18}
18``` 19```
19 20
20Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and `libsmctrl.h` for details and examples of how to use this library. 21The ability for `libsmctrl` to work on unmodified tasks was developed as part of a follow-up paper:
22
23_J. Bakita and J. H. Anderson, "Hardware Compute Partitioning on NVIDIA GPUs for Composable Systems", Proceedings of the 37th Euromicro Conference on Real-Time Systems, pp. 18:1-18:24, July 2025._
24
25Please cite this paper in any work which uses this for partitioning unmodified tasks. Here's the BibTeX entry:
26```
27@inproceedings{bakita2025hardware,
28 title={Hardware Compute Partitioning on {NVIDIA} {GPUs} for Composable Systems},
29 author={Bakita, Joshua and Anderson, James H},
30 booktitle={Proceedings of the 37th Euromicro Conference on Real-Time Systems},
31 year={2025},
32 month={July},
33 pages={18:1--18:24},
34 doi={10.1109/ECRTS.2025.18},
35 _series={ECRTS}
36}
37```
38
39Please see [the first paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf), [the second paper](https://www.cs.unc.edu/~jbakita/ecrts25.pdf) and `libsmctrl.h` for details and examples of how to use this library.
21We strongly encourage consulting those resources first; the below comments serve merely as an appendum. 40We strongly encourage consulting those resources first; the below comments serve merely as an appendum.
22 41
23## Run-time Dependencies 42## Run-time Dependencies
24`libcuda.so`, which is automatically installed by the NVIDIA GPU driver. 43`libcuda.so`, which is automatically installed by the NVIDIA GPU driver.
25 44
45(Technically `libdl` is also required, but this should never need to be manually installed. This is a dependency of CUDA, and is also part of the GNU C Standard Library starting with version 2.34.)
46
26## Building 47## Building
27To build, ensure that you have `gcc` installed and access to the CUDA SDK including `nvcc`. Then run: 48To build, ensure that you have `gcc` installed and access to the CUDA SDK including `nvcc`. Then run:
28``` 49```
@@ -66,8 +87,52 @@ nvcc benchmark.cu -o benchmark -I/playpen/libsmctl -lsmctrl -lcuda -L/playpen/li
66``` 87```
67The resultant `benchmark` binary should be portable to any system with an equivalent or newer version of the NVIDIA GPU driver installed. 88The resultant `benchmark` binary should be portable to any system with an equivalent or newer version of the NVIDIA GPU driver installed.
68 89
90## Use Without Application Modification
91As an alternative to modifying your application, `libsmctrl` can be installed system-wide, and partitions for each application can be set via the `nvtaskset` tool.
92The `nvtaskset` tool works very similarly to the Linux CPU-affinity-setting tool `taskset`.
93
94To install `libsmctrl` system-wide, such that all CUDA-using applications automatically load it, ensure that `patchelf` is installed (`sudo apt install patchelf`), and run:
95```
96make libcuda.so.1 install
97```
98Or, if you do not want to modify any system-wide state, and only want `libsmctrl` loaded as part of anything run from this console:
99```
100make libcuda.so.1
101export LD_LIBRARY_PATH=$(pwd)
102```
103(This works because CUDA is always dynamically loaded from `libcuda.so.1`, and `lbsmctrl` creates a "fake" `libcuda.so.1` in this directory that wraps CUDA.
104 Setting `LD_LIBRARY_PATH` ensures that the wrapped version is the first one loaded.
105 The only difference with running `make install` is that it copies our "fake" `libcuda.so.1` to a location where the loader will automatically find it.)
106
107And then to start an application within a specific TPC partition, e.g., the first 10 TPCs:
108```
109./nvtaskset -t 0-9 my_program my_args
110```
111Note that this will automatically start NVIDIA MPS, which is a prerequisite to co-run tasks on NVIDIA GPUs without timeslicing.
112
113And to change the TPCs available for a process ID 1234 to to the first 10 TPCs:
114```
115./nvtaskset -tp 0-9 1234
116```
117
118Or, to change a process of ID 1234 to only run on GPC 3:
119```
120./nvtaskset -gp 3 1234
121```
122
123To remove the system-wide installation of `libsmctrl`, run:
124```
125make remove
126```
127
69## Run Tests 128## Run Tests
70To test partitioning: 129
130To run them all:
131```
132make run_tests
133```
134
135If you prefer to run them individually, to test partitioning:
71``` 136```
72make tests 137make tests
73./libsmctrl_test_global_mask 138./libsmctrl_test_global_mask
@@ -82,18 +147,26 @@ make tests
82./libsmctrl_test_next_mask_override 147./libsmctrl_test_next_mask_override
83``` 148```
84 149
85And if `nvdebug` has been installed: 150To test that `nvtaskset` can dynamically change the mask of a running program:
86``` 151```
87make tests 152make libsmctrl_test_supreme_mask
153./libsmctrl_test_supreme_mask
154```
155
156To test that TPC to GPC mappings can be obtained (if `nvdebug` has been installed):
157```
158make libsmctrl_test_gpc_info
88./libsmctrl_test_gpc_info 159./libsmctrl_test_gpc_info
89``` 160```
90 161
162The `CUDA_VISIBLE_DEVICES` environment variable can be set to run any of the partitioning tests on a different GPU.
163
91## Supported GPUs 164## Supported GPUs
92 165
93#### Known Working 166#### Known Working
94 167
95- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs 168- NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
96- CUDA 6.5 through 12.6 169- CUDA 6.5 through 12.8
97- `x86_64` and Jetson `aarch64` platforms 170- `x86_64` and Jetson `aarch64` platforms
98 171
99#### Known Issues 172#### Known Issues
diff --git a/libsmctrl.c b/libsmctrl.c
index 6aa471b..79d2b33 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -17,22 +17,27 @@
17 * Please contact the authors if support is needed for a particular feature on 17 * Please contact the authors if support is needed for a particular feature on
18 * an older CUDA version. Support for those is unimplemented, not impossible. 18 * an older CUDA version. Support for those is unimplemented, not impossible.
19 * 19 *
20 * An old implementation of this file effected the global mask on CUDA 10.2 by 20 * An old implementation of this file affected the global mask on CUDA 10.2 by
21 * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD. 21 * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
22 * That implementation was extraordinarily complicated, and was replaced in 22 * That implementation was extraordinarily complicated, and was replaced in
23 * 2024 with a more-backward-compatible way of hooking the TMD/QMD. 23 * 2024 with a more-backward-compatible way of hooking the TMD/QMD.
24 * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`. 24 * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
25 */ 25 */
26#define _GNU_SOURCE // To enable use of memfd_create()
26#include <cuda.h> 27#include <cuda.h>
27 28
28#include <errno.h> 29#include <errno.h>
29#include <error.h> 30#include <error.h>
31#include <dlfcn.h>
30#include <fcntl.h> 32#include <fcntl.h>
31#include <stdbool.h> 33#include <stdbool.h>
32#include <stdint.h> 34#include <stdint.h>
33#include <stdio.h> 35#include <stdio.h>
34#include <sys/ipc.h> 36#include <string.h>
35#include <sys/shm.h> 37#include <sys/mman.h>
38#include <sys/socket.h>
39#include <sys/types.h>
40#include <sys/un.h>
36#include <unistd.h> 41#include <unistd.h>
37 42
38#include "libsmctrl.h" 43#include "libsmctrl.h"
@@ -48,20 +53,33 @@
48// (No testing attempted on pre-CUDA-6.5 versions) 53// (No testing attempted on pre-CUDA-6.5 versions)
49// Values for the following three lines can be extracted by tracing CUPTI as 54// Values for the following three lines can be extracted by tracing CUPTI as
50// it interects with libcuda.so to set callbacks. 55// it interects with libcuda.so to set callbacks.
51static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; 56static const CUuuid callback_funcs_id = {{0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}};
52// These callback descriptors appear to intercept the TMD/QMD late enough that 57// These callback descriptors appear to intercept the TMD/QMD late enough that
53// CUDA has already applied the per-stream mask from its internal data 58// CUDA has already applied the per-stream mask from its internal data
54// structures, allowing us to override it with the next mask. 59// structures, allowing us to override it with the next mask.
55#define QMD_DOMAIN 0xb 60#define QMD_DOMAIN 0xb
56#define QMD_PRE_UPLOAD 0x1 61#define QMD_PRE_UPLOAD 0x1
62/**
63 * These globals must be non-static (i.e., have global linkage) to ensure that
64 * if multiple copies of the library are loaded (e.g., dynamically linked to
65 * both this program and a dependency), secondary copies do not attempt to
66 * repeat initialization or make changes to unused copies of mask values.
67 */
57// Supreme mask (cannot be overridden) 68// Supreme mask (cannot be overridden)
58static uint64_t *g_supreme_sm_mask = NULL; 69uint128_t *g_supreme_sm_mask = NULL;
59// Global mask (applies across all threads) 70// Global mask (applies across all threads)
60static uint64_t g_sm_mask = 0; 71uint64_t g_sm_mask = 0;
61// Next mask (applies per-thread) 72// Next mask (applies per-thread)
62static __thread uint64_t g_next_sm_mask = 0; 73__thread uint64_t g_next_sm_mask = 0;
63// Flag value to indicate if setup has been completed 74// Flag value to indicate if setup has been completed
64static bool sm_control_setup_called = false; 75bool sm_control_setup_called = false;
76
77#ifdef LIBSMCTRL_STATIC
78// Special handling for if built as a static library, and the libcuda.so.1
79// libsmctrl wrapper is in use (see comment on setup() constructor for detail).
80static void (*shared_set_global_mask)(uint64_t) = NULL;
81static void (*shared_set_next_mask)(uint64_t) = NULL;
82#endif
65 83
66// v1 has been removed---it intercepted the TMD/QMD too early, making it 84// v1 has been removed---it intercepted the TMD/QMD too early, making it
67// impossible to override the CUDA-injected stream mask with the next mask. 85// impossible to override the CUDA-injected stream mask with the next mask.
@@ -78,7 +96,7 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
78 if (!tmd) 96 if (!tmd)
79 abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n"); 97 abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n");
80 98
81 uint32_t *lower_ptr, *upper_ptr; 99 uint32_t *lower_ptr, *upper_ptr, *ext_lower_ptr, *ext_upper_ptr;
82 100
83 // The location of the TMD version field seems consistent across versions 101 // The location of the TMD version field seems consistent across versions
84 uint8_t tmd_ver = *(uint8_t*)(tmd + 72); 102 uint8_t tmd_ver = *(uint8_t*)(tmd + 72);
@@ -87,10 +105,12 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
87 // TMD V04_00 is used starting with Hopper to support masking >64 TPCs 105 // TMD V04_00 is used starting with Hopper to support masking >64 TPCs
88 lower_ptr = tmd + 304; 106 lower_ptr = tmd + 304;
89 upper_ptr = tmd + 308; 107 upper_ptr = tmd + 308;
108 ext_lower_ptr = tmd + 312;
109 ext_upper_ptr = tmd + 316;
90 // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and 110 // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
91 // ...global_mask_ext 111 // ...global_mask_ext
92 *(uint32_t*)(tmd + 312) = -1; 112 *ext_lower_ptr = -1;
93 *(uint32_t*)(tmd + 316) = -1; 113 *ext_upper_ptr = -1;
94 // An enable bit is also required 114 // An enable bit is also required
95 *(uint32_t*)tmd |= 0x80000000; 115 *(uint32_t*)tmd |= 0x80000000;
96 } else if (tmd_ver >= 0x16) { 116 } else if (tmd_ver >= 0x16) {
@@ -119,6 +139,10 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
119 if (g_supreme_sm_mask) { 139 if (g_supreme_sm_mask) {
120 *lower_ptr |= (uint32_t)*g_supreme_sm_mask; 140 *lower_ptr |= (uint32_t)*g_supreme_sm_mask;
121 *upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32); 141 *upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32);
142 if (tmd_ver >= 0x40) {
143 *ext_lower_ptr |= (uint32_t)(*g_supreme_sm_mask >> 64);
144 *ext_upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 96);
145 }
122 } 146 }
123 147
124 //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr); 148 //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
@@ -163,12 +187,26 @@ static void setup_sm_control_callback() {
163 187
164// Set default mask for all launches 188// Set default mask for all launches
165void libsmctrl_set_global_mask(uint64_t mask) { 189void libsmctrl_set_global_mask(uint64_t mask) {
190#ifdef LIBSMCTRL_STATIC
191 // Special handling for if built as a static library, and the libcuda.so.1
192 // libsmctrl wrapper is in use (see comment on setup() constructor for
193 // detail).
194 if (shared_set_global_mask)
195 return (*shared_set_global_mask)(mask);
196#endif
166 setup_sm_control_callback(); 197 setup_sm_control_callback();
167 g_sm_mask = mask; 198 g_sm_mask = mask;
168} 199}
169 200
170// Set mask for next launch from this thread 201// Set mask for next launch from this thread
171void libsmctrl_set_next_mask(uint64_t mask) { 202void libsmctrl_set_next_mask(uint64_t mask) {
203#ifdef LIBSMCTRL_STATIC
204 // Special handling for if built as a static library, and the libcuda.so.1
205 // libsmctrl wrapper is in use (see comment on setup() constructor for
206 // detail).
207 if (shared_set_next_mask)
208 return (*shared_set_next_mask)(mask);
209#endif
172 setup_sm_control_callback(); 210 setup_sm_control_callback();
173 g_next_sm_mask = mask; 211 g_next_sm_mask = mask;
174} 212}
@@ -248,7 +286,7 @@ struct stream_sm_mask_v2 {
248// (CUDA 9.0 behaves slightly different on this platform.) 286// (CUDA 9.0 behaves slightly different on this platform.)
249// @return 1 if detected, 0 if not, -cuda_err on error 287// @return 1 if detected, 0 if not, -cuda_err on error
250#if __aarch64__ 288#if __aarch64__
251int detect_parker_soc() { 289static int detect_parker_soc() {
252 int cap_major, cap_minor, err, dev_count; 290 int cap_major, cap_minor, err, dev_count;
253 if (err = cuDeviceGetCount(&dev_count)) 291 if (err = cuDeviceGetCount(&dev_count))
254 return -err; 292 return -err;
@@ -272,7 +310,7 @@ int detect_parker_soc() {
272} 310}
273#endif // __aarch64__ 311#endif // __aarch64__
274 312
275// Should work for CUDA 8.0 through 12.6 313// Should work for CUDA 8.0 through 12.8
276// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in 314// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
277// our header 315// our header
278void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { 316void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
@@ -417,7 +455,8 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
417 } 455 }
418} 456}
419 457
420/* INFORMATIONAL FUNCTIONS */ 458
459/*** TPC and GPU Informational Functions ***/
421 460
422// Read an integer from a file in `/proc` 461// Read an integer from a file in `/proc`
423static int read_int_procfile(char* filename, uint64_t* out) { 462static int read_int_procfile(char* filename, uint64_t* out) {
@@ -590,32 +629,98 @@ abort_cuda:
590 return EIO; 629 return EIO;
591} 630}
592 631
632
633/*** Private functions for nvtaskset and building as a libcuda.so.1 wrapper ***/
634
635// Check if NVIDIA MPS is running, following the process that `strace` shows
636// `nvidia-cuda-mps-control` to use. MPS is a prerequisite to co-running
637// multiple GPU-using tasks without timeslicing.
638bool libsmctrl_is_mps_running() {
639 char *mps_pipe_dir;
640 int mps_ctrl;
641 struct sockaddr_un mps_ctrl_addr;
642 mps_ctrl_addr.sun_family = AF_UNIX;
643 const int yes = 1;
644
645 if (!(mps_pipe_dir = getenv("CUDA_MPS_PIPE_DIRECTORY")))
646 mps_pipe_dir = "/tmp/nvidia-mps";
647 // Pipe names are limited to 108 characters long
648 snprintf(mps_ctrl_addr.sun_path, 108, "%s/control", mps_pipe_dir);
649 // This mirrors the process `nvidia-cuda-mps-control` uses to detect MPS
650 if ((mps_ctrl = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
651 return false;
652 if (setsockopt(mps_ctrl, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(yes)) == -1)
653 return false;
654 if (connect(mps_ctrl, &mps_ctrl_addr, sizeof(struct sockaddr_un)) == -1)
655 return false;
656 close(mps_ctrl);
657 return true;
658}
659
660// A variant of strtoul with support for 128-bit integers
661uint128_t strtou128(const char *nptr, char **endptr, int base) {
662 unsigned __int128 result = 0;
663 if (base != 16)
664 error(1, EINVAL, "strtou128 only supports base 16");
665 // Skip a "0x" prefix. Safe due to early evaluation
666 if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X'))
667 nptr += 2;
668 // Until hitting an invalid character
669 while (1) {
670 if (*nptr >= 'a' && *nptr <= 'f')
671 result = result << 4 | (*nptr - 'a' + 10);
672 else if (*nptr >= 'A' && *nptr <= 'F')
673 result = result << 4 | (*nptr - 'A' + 10);
674 else if (*nptr >= '0' && *nptr <= '9')
675 result = result << 4 | (*nptr - '0');
676 else
677 break;
678 nptr++;
679 }
680 if (endptr)
681 *endptr = (char*)nptr;
682 return result;
683}
684
685#ifdef LIBSMCTRL_WRAPPER
686// The CUDA runtime library uses dlopen() to load CUDA functions from
687// libcuda.so.1. Since we replace that with our wrapper library, we need to
688// also redirect any attempted opens of that shared object to the actual
689// shared library, which is linked to by libcuda.so.
690void *dlopen(const char *filename, int flags) {
691 if (filename && strcmp(filename, "libcuda.so") == 0) {
692 fprintf(stderr, "redirecting dlopen of %s to libcuda.so\n", filename);
693 // A GNU-only dlopen variant
694 return dlmopen(LM_ID_BASE, "libcuda.so", flags);
695 } else
696 return dlmopen(LM_ID_BASE, filename, flags);
697}
698
593// Allow setting a default mask via an environment variable 699// Allow setting a default mask via an environment variable
594// Also enables libsmctrl to be used on unmodified programs via setting: 700// Also enables libsmctrl to be used on unmodified programs via setting:
595// LD_PRELOAD=libsmctrl.so LIBSMCTRL_MASK=<your mask> ./my_program 701// LD_LIBRARY_PATH=libsmctrl LIBSMCTRL_MASK=<your mask> ./my_program
596// Where "<your mask>" is replaced with a disable mask, optionally prefixed 702// Where "<your mask>" is replaced with a disable mask, optionally prefixed
597// with a ~ to invert it (make it an enable mask). 703// with a ~ to invert it (make it an enable mask).
598__attribute__((constructor)) static void setup(void) { 704__attribute__((constructor)) static void setup(void) {
599 char *end, *mask_str; 705 char *end, *mask_str;
600 // If dynamic changes are disabled (due to an error) this variable is 706 // If dynamic changes are disabled (due to an error) this variable is
601 // permanently used to store the supreme mask, rather than the SysV shared 707 // permanently used to store the supreme mask, rather than the shared
602 // memory segment. 708 // memory segment.
603 static uint64_t mask; 709 static uint128_t mask;
604 bool invert = false; 710 bool invert = false;
605 int shmid;
606 key_t shm_key;
607 711
608 mask_str = getenv("LIBSMCTRL_MASK"); 712 mask_str = getenv("LIBSMCTRL_MASK");
713
714 // Assume no mask if unspecified
609 if (!mask_str) 715 if (!mask_str)
610 return; 716 mask_str = "0";
611 717
612 if (*mask_str == '~') { 718 if (*mask_str == '~') {
613 invert = true; 719 invert = true;
614 mask_str++; 720 mask_str++;
615 } 721 }
616 722
617 // XXX: Doesn't support 128-bit masks 723 mask = strtou128(mask_str, &end, 16);
618 mask = strtoull(mask_str, &end, 0);
619 // Verify we were able to parse the whole string 724 // Verify we were able to parse the whole string
620 if (*end != '\0') 725 if (*end != '\0')
621 abort(1, EINVAL, "Unable to apply default mask"); 726 abort(1, EINVAL, "Unable to apply default mask");
@@ -623,35 +728,64 @@ __attribute__((constructor)) static void setup(void) {
623 if (invert) 728 if (invert)
624 mask = ~mask; 729 mask = ~mask;
625 730
731 // Explictly set the number of channels (if unset), otherwise CUDA will only
732 // use two with MPS (see paper for why that causes problems)
733 if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 0) == -1)
734 abort(1, EINVAL, "Unable to configure environment");
735
736 // Warn if a mask was specified but MPS isn't running
737 if (mask && !libsmctrl_is_mps_running())
738 fprintf(stderr, "libsmctrl-libcuda-wrapper: Warning: TPC mask set via LIBSMCTRL_MASK, but NVIDIA MPS is not running. CUDA programs will not co-run!\n");
739
626 // Initialize CUDA and the interception callback 740 // Initialize CUDA and the interception callback
627 setup_sm_control_callback(); 741 setup_sm_control_callback();
628 742
629 // TODO: Switch to memfd_create(); this leaks IPC objects 743 // Create shared memory region for the supreme mask such that nvtaskset
630 // Create a SysV IPC key (32 bits) to identify our shared memory region 744 // can read and modify it
631 // Use the pid as the top 16 bits, and "sm" as the bottom 16 745 int fd = memfd_create("libsmctrl", MFD_CLOEXEC);
632 shm_key = getpid(); 746 if (fd == -1) {
633 shm_key <<= 16;
634 shm_key |= (int)'s' << 8 | (int)'m';
635 // Obtain or create a 128-bit (16-byte) shared memory region
636 shmid = shmget(shm_key, 16, IPC_CREAT | 0600);
637 if (shmid == -1) {
638 abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); 747 abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
639 g_supreme_sm_mask = &mask; 748 g_supreme_sm_mask = &mask;
640 return; 749 return;
641 } 750 }
642 // Open the shared memory region 751 if (ftruncate(fd, 16) == -1) {
643 g_supreme_sm_mask = shmat(shmid, NULL, 0); 752 abort(0, errno, "Unable to resize shared memory for dynamic partition changes. Dynamic changes disabled");
644 if (g_supreme_sm_mask == (void*)-1) { 753 g_supreme_sm_mask = &mask;
645 abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); 754 return;
755 }
756 if ((g_supreme_sm_mask = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) {
757 abort(0, errno, "Unable to map shared memory for dynamic partition changes. Dynamic changes disabled");
646 g_supreme_sm_mask = &mask; 758 g_supreme_sm_mask = &mask;
647 return; 759 return;
648 } 760 }
649 // XXX: This makes the region unopenable to everyone else. Switch to memfd!
650 // Mark the shared memory region for deletion (after we terminate)
651 if (shmctl(shmid, IPC_RMID, NULL) == -1)
652 abort(0, errno, "Unable to mark shared memory for dynamic partition changes for deletion on process termination. Will leak one page of memory.");
653 761
654 // Set the super-global mask which cannot be overwritten by any libsmctrl 762 // Set the super-global mask which cannot be overwritten by any libsmctrl
655 // API function. 763 // API function.
656 *g_supreme_sm_mask = mask; 764 *g_supreme_sm_mask = mask;
657} 765}
766#elif defined(LIBSMCTRL_STATIC)
767// If this library is statically built into a program, and the libcuda.so.1
768// wrapper is enabled, we force the staticlly linked version of the library
769// to defer to the function implementations in the wrapper.
770//
771// Longer explanation:
772// If the library has been dynamically linked into a program and the wrapper
773// is in use, the loader will point both to the same set of symbols (since both
774// will do a dynamic lookup at load-time, the global state at the top of this
775// file uses global linkage, and will thus be in the dynamic symbol table, and
776// each lookup will find the same copy.)
777// Symbols from a staticlly linked library are not included in the dynamic
778// symbol table, and thus can exist in duplicate of those in any shared
779// library. This is a problem, since only one callback function, using one set
780// of global variables can be registered with CUDA. We work around this by
781// having our statically linked library use the functions from the wrapper or
782// any shared library, if one such instance is loaded.
783__attribute__((constructor)) static void setup(void) {
784 // dlsym can only view the dynamic symbol tables, and so these lookups will
785 // fail if neither the wrapper (libcuda.so.1) nor libsmctrl.so are loaded.
786 // (That indicates that we should the static library implementations.)
787 // These are a NOP on failure since they return NULL when not found.
788 shared_set_next_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_next_mask");
789 shared_set_global_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_global_mask");
790}
791#endif
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c
index 558b80a..afa0876 100644
--- a/libsmctrl_test_gpc_info.c
+++ b/libsmctrl_test_gpc_info.c
@@ -38,6 +38,8 @@ int main(int argc, char** argv) {
38 gpu_id = 0; 38 gpu_id = 0;
39 // Tell CUDA to use PCI device id ordering (to match nvdebug) 39 // Tell CUDA to use PCI device id ordering (to match nvdebug)
40 putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); 40 putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
41 // Allow CUDA to see all devices (to better match nvdebug)
42 unsetenv("CUDA_VISIBLE_DEVICES");
41 // A CUDA context is required before reading the topology information 43 // A CUDA context is required before reading the topology information
42 if ((res = cuInit(0))) { 44 if ((res = cuInit(0))) {
43 const char* name; 45 const char* name;
@@ -45,7 +47,7 @@ int main(int argc, char** argv) {
45 fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name); 47 fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name);
46 return 1; 48 return 1;
47 } 49 }
48 if ((res = cuCtxCreate(&ctx, 0, 0))) { 50 if ((res = cuCtxCreate(&ctx, 0, gpu_id))) {
49 const char* name; 51 const char* name;
50 cuGetErrorName(res, &name); 52 cuGetErrorName(res, &name);
51 fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name); 53 fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name);
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu
index 3b7ebcd..8d2bd79 100644
--- a/libsmctrl_test_mask_shared.cu
+++ b/libsmctrl_test_mask_shared.cu
@@ -18,9 +18,8 @@ __global__ void read_and_store_smid(uint8_t* smid_arr) {
18 smid_arr[blockIdx.x] = smid; 18 smid_arr[blockIdx.x] = smid;
19} 19}
20 20
21// Assuming SMs continue to support a maximum of 2048 resident threads, six 21// Need at least as many blocks as there are SMs on NVIDIA's biggest GPUs
22// blocks of 1024 threads should span at least three SMs without partitioning 22#define NUM_BLOCKS 142
23#define NUM_BLOCKS 142 //6
24 23
25static int sort_asc(const void* a, const void* b) { 24static int sort_asc(const void* a, const void* b) {
26 return *(uint8_t*)a - *(uint8_t*)b; 25 return *(uint8_t*)a - *(uint8_t*)b;
@@ -83,8 +82,11 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
83 // Apply partitioning to enable only the first TPC of each 32-bit block 82 // Apply partitioning to enable only the first TPC of each 32-bit block
84 switch (part_type) { 83 switch (part_type) {
85 case PARTITION_SUPREME: 84 case PARTITION_SUPREME:
86 printf("%s: Please set mask to '0x%016lx%016lx' for PID %d using the control deamon and press any key to continue...\n", program_invocation_name, (uint64_t)(mask >> 64), (uint64_t)mask, getpid()); 85 char cmd[80];
87 fgetc(stdin); 86 // We must invert the mask before passing it to nvtaskset, since
87 // nvtaskset takes an enable mask (as with the taskset command)
88 snprintf(cmd, 80, "./nvtaskset -p 0x%.0lx%016lx %d > /dev/null", ~(uint64_t)(mask >> 64), ~(uint64_t)mask, getpid());
89 system(cmd);
88 break; 90 break;
89 case PARTITION_GLOBAL: 91 case PARTITION_GLOBAL:
90 libsmctrl_set_global_mask(mask); 92 libsmctrl_set_global_mask(mask);
@@ -120,10 +122,9 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
120 uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too 122 uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too
121 if (uniq_partitioned > sms_per_tpc) { 123 if (uniq_partitioned > sms_per_tpc) {
122 printf("%s: ***Test failure.***\n" 124 printf("%s: ***Test failure.***\n"
123 "%s: Reason: With TPC mask set to " 125 "%s: Reason: With a partition of only one TPC, the test kernel "
124 "constrain all kernels to a single TPC, a kernel of %d blocks of " 126 "of %d blocks of 1024 threads ran on %d SMs (at most %d---one "
125 "1024 threads was launched and found to run on %d SMs (at most %d---" 127 "TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
126 "one TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
127 return 1; 128 return 1;
128 } 129 }
129 130
@@ -131,18 +132,16 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
131 if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 || 132 if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
132 smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) { 133 smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) {
133 printf("%s: ***Test failure.***\n" 134 printf("%s: ***Test failure.***\n"
134 "%s: Reason: With TPC mask set to " 135 "%s: Reason: With a partition of only TPC %d, the test kernel "
135 "constrain all kernels to TPC %d, a kernel was run and found " 136 "ran on SM IDs as high as %d and as low as %d (range of %d to %d "
136 "to run on an SM IDs: as high as %d and as low as %d (range of %d to %d expected).\n", 137 "expected).\n", program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
137 program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
138 return 1; 138 return 1;
139 } 139 }
140 140
141 // Div by 32 via a shift 141 // Div by 32 via a shift
142 asprintf(&reason[enabled_tpc >> 5], 142 asprintf(&reason[enabled_tpc >> 5],
143 "With a partition enabled which " 143 "With a partition of only TPC %d, the test kernel used only %d "
144 "contained only TPC ID %d, the test kernel was found to use only %d " 144 "SMs (%d without), and all had IDs between %d and %d (were contained"
145 "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained"
146 " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc); 145 " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc);
147 } 146 }
148 147
diff --git a/nvtaskset.c b/nvtaskset.c
index 74f88d7..4901cbe 100644
--- a/nvtaskset.c
+++ b/nvtaskset.c
@@ -1,210 +1,384 @@
1// Copyright 2025 Joshua Bakita 1// Copyright 2025 Joshua Bakita
2// taskset-like utility for the GPU 2// Show or change the GPU core affinity for a CUDA process
3// taskset-like utility for NVIDIA GPUs
3#define _GNU_SOURCE // For program_invocation_name 4#define _GNU_SOURCE // For program_invocation_name
4#include <argp.h> 5#include <argp.h>
6#include <dirent.h>
5#include <errno.h> 7#include <errno.h>
6#include <error.h> 8#include <error.h>
9#include <fcntl.h>
10#include <stdbool.h>
7#include <stdint.h> 11#include <stdint.h>
8#include <stdio.h> 12#include <stdio.h>
9#include <stdlib.h> 13#include <stdlib.h>
10#include <string.h> 14#include <string.h>
11#include <sys/ipc.h> 15#include <sys/mman.h>
12#include <sys/shm.h>
13#include <sys/types.h>
14#include <unistd.h> 16#include <unistd.h>
15 17
16#include <cuda.h> // To help with getting GPC info 18#include <cuda.h> // To help with getting GPC info
17 19
18#include "libsmctrl.h" 20#include "libsmctrl.h"
19 21
20const char* maintainer = "<jbakita@cs.unc.edu>"; 22#define LINK_NAME "/memfd:libsmctrl"
21const char* version = "nvtaskset 2025.03"; 23
22const char* desc = "taskset-like utility for NVIDIA GPUs."; 24// TODO: Write automated tests:
25// - Change region of non-existent PID
26// - Change region of permission denied PID
27// - Change region of non-GPU PID
28// - Change GPC list
29// - Change TPC list
30// - Change TPC mask
31// - Start TPC mask
32// - Start TPC list
33// - Start GPC list
34// - Start with subargument containing -
35// - Query GPC list
36// - Query TPC list
37// - Query TPC mask
38// - Set GPC list w/ non-existant GPC
39// - Set TPC list w/ non-existant TPC
40
41// Private symbols from libsmctrl
42extern bool libsmctrl_is_mps_running();
43extern uint128_t strtou128(const char *nptr, char **endptr, int base);
44
45const char *argp_program_bug_address = "<jbakita@cs.unc.edu>";
46const char *argp_program_version = "nvtaskset 2025.06";
47const char desc[] = "Show or change the GPU core affinity for a CUDA process\v"
48 "Warning: When using GPC lists, this tool currently "
49 "derives TPC to GPC mappings from the first NVIDIA GPU in "
50 "the system (by PCI bus ID) device. To use the mappings "
51 "for a different device, use the `libsmctrl_test_get_info` "
52 "tool to get the bitmask of TPCs associated with each GPC, "
53 "OR them, and then set that bitmask via this tool. Better "
54 "multi-GPU support is intended for a future release.\n\n"
55 "Inspired by the Linux taskset utility.";
56const char args_doc[] = "[mask | list] [pid | cmd [args...]]";
23 57
24const struct argp_option opts[] = { 58const struct argp_option opts[] = {
59 {"gpc-list", 'g', NULL, 0, "Specify partition as a list of GPCs"},
60 {"tpc-list", 't', NULL, 0, "Specify partition as a list of TPCs"},
61 {"pid", 'p', NULL, 0, "Operate on an existing PID"},
25 {0} 62 {0}
26}; 63};
27 64
28unsigned __int128 strtou128(const char *nptr, char **endptr, int base) { 65// Create a CUDA context and query the associated GPC to TPC mappings
29 unsigned __int128 result = 0; 66// Based off logic in libsmctrl_test_gpc_info
30 if (base != 16)
31 error(1, EINVAL, "Internal error");
32 // Skip a "0x" prefix. Safe due to early evaluation
33 if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X'))
34 nptr += 2;
35 // Until hitting an invalid character
36 while (1) {
37 if (*nptr >= 'a' && *nptr <= 'f')
38 result = result << 4 | (*nptr - 'a' + 10);
39 else if (*nptr >= 'A' && *nptr <= 'F')
40 result = result << 4 | (*nptr - 'A' + 10);
41 else if (*nptr >= '0' && *nptr <= '9')
42 result = result << 4 | (*nptr - '0');
43 else
44 break;
45 nptr++;
46 }
47 if (endptr)
48 *endptr = (char*)nptr;
49 return result;
50}
51
52void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int gpu_id) { 67void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int gpu_id) {
53 int res; 68 int res;
54 CUcontext ctx; 69 CUcontext ctx;
55 // XXX: Copied from libsmctrl_test_gpc_info 70 char *old_order = NULL;
56 // Tell CUDA to use PCI device id ordering (to match nvdebug) 71 // Tell CUDA to use PCI device id ordering (to match nvdebug)
57 putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); 72 putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
58 // A CUDA context is required before reading the topology information 73 // Allow CUDA to see all devices (to better match nvdebug)
59 if ((res = cuInit(0))) { 74 if (getenv("CUDA_VISIBLE_DEVICES")) {
60 const char* name; 75 if (!(old_order = strdup(getenv("CUDA_VISIBLE_DEVICES"))))
61 cuGetErrorName(res, &name); 76 error(1, errno, "Unable to allocate environment string");
62 fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name); 77 unsetenv("CUDA_VISIBLE_DEVICES");
63 exit(1); 78 }
64 } 79 // A CUDA context is required before reading the topology information
65 if ((res = cuCtxCreate(&ctx, 0, 0))) { 80 if ((res = cuInit(0))) {
66 const char* name; 81 const char* name;
67 cuGetErrorName(res, &name); 82 cuGetErrorName(res, &name);
68 fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name); 83 error(1, 0, "Unable to create a initialize CUDA, error %s\n", name);
69 exit(1); 84 }
70 } 85 if ((res = cuCtxCreate(&ctx, 0, gpu_id))) {
71 // Pull topology information from libsmctrl 86 const char* name;
72 if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) { 87 cuGetErrorName(res, &name);
73 error(0, res, "libsmctrl_get_gpc_info() failed"); 88 error(1, 0, "Unable to create a CUDA context, error %s\n", name);
74 if (res == ENOENT) 89 }
75 fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name); 90 // Pull topology information from libsmctrl
76 if (res == EIO) 91 if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) {
77 fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name); 92 error(0, res, "libsmctrl_get_gpc_info() failed");
78 exit(1); 93 if (res == ENOENT)
79 } 94 fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name);
80 // Not copied 95 if (res == EIO)
96 fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name);
97 exit(1);
98 }
99 // Restore the environment (in case we exec() later)
81 unsetenv("CUDA_DEVICE_ORDER"); 100 unsetenv("CUDA_DEVICE_ORDER");
101 if (old_order) {
102 setenv("CUDA_VISIBLE_DEVICES", old_order, 1);
103 free(old_order);
104 }
82} 105}
83 106
84int main(int argc, char **argv) { 107int parse_list(bool use_gpcs, char* list, uint128_t *mask_out) {
85 if (argc < 3) { 108 // We support the same ranges as taskset, e.g., X,Y,Z and X,Y-Z
86 fprintf(stderr, "Usage: %s -p <hex mask> <pid>\n", argv[0]); 109 uint32_t num_xpcs = 0; // Either TPC or GPC count, i.e., "X"PC
87 fprintf(stderr, " %s <hex mask> <command> <argument...>\n", argv[0]); 110 uint128_t* masks = NULL;
88 fprintf(stderr, " %s --gpc-list <gpc list> <command> <argument...>\n", argv[0]); 111 // TODO: Allow specifying GPU ID, rather than assuming 0!
89 fprintf(stderr, " <hex mask> has a bit set for each TPC to be enabled\n"); 112 if (use_gpcs)
90 return 1; 113 libsmctrl_get_gpc_info_ext_easy(&num_xpcs, &masks, 0);
91 } 114 else
92 // TODO: Use a proper argument parser 115 libsmctrl_get_tpc_info_cuda(&num_xpcs, 0);
93 if (strcmp("-p", argv[1]) == 0) { // Setting mask on running task 116 uint128_t mask = 0;
94 char *end; 117 int range_start_xpc = -1;
95 pid_t target_pid = strtoul(argv[2], &end, 10); 118 char* start = list;
96 // strtoul stores a pointer to the first invalid character in `end` 119 int len = strlen(list);
97 if (*end != '\0') { 120 // Convert comma-seperated GPC/TPC list into a mask
98 fprintf(stderr, "Invalid character \"%c\" in PID argument.\n", *end); 121 for (int i = 0; i < len + 1; i++) {
99 return 1; 122 if (list[i] == ',' || list[i] == '\0') {
123 list[i] = '\0';
124 int xpc = atoi(start);
125 if (xpc > num_xpcs - 1)
126 error(1, EINVAL, "%s is not a valid %s ID", start, use_gpcs ? "GPC" : "TPC");
127 // Handle ranges
128 if (range_start_xpc != -1) {
129 if (range_start_xpc >= xpc)
130 error(1, EINVAL, "Malformed %s range", use_gpcs ? "GPC" : "TPC");
131 while (range_start_xpc <= xpc) {
132 if (use_gpcs)
133 mask |= masks[range_start_xpc];
134 else
135 mask |= (uint128_t)1 << range_start_xpc;
136 range_start_xpc++;
137 }
138 range_start_xpc = -1;
139 } else {
140 if (use_gpcs)
141 mask |= masks[xpc];
142 else
143 mask |= (uint128_t)1 << xpc;
144 }
145 start = list + i + 1;
100 } 146 }
101 unsigned __int128 mask = strtou128(argv[3], &end, 16); 147 // Range start
102 if (*end != '\0') { 148 if (list[i] == '-') {
103 fprintf(stderr, "Invalid character \"%c\" in mask argument.\n", *end); 149 list[i] = '\0';
104 return 1; 150 range_start_xpc = atoi(start);
151 start = list + i + 1;
105 } 152 }
106 // The shared memory lookup key is the lower 16-bits of the PID | "sm" 153 }
107 key_t shm_key = target_pid << 16 | (int)'s' << 8 | (int) 'm'; 154 *mask_out = mask;
108 // Get a handle to the 128-bit shared memory region 155 return 0;
109 int shmid = shmget(shm_key, 16, 0); 156}
110 if (shmid == -1) 157
111 error(1, errno, "Unable to find control region for PID %d", target_pid); 158// Always returns a valid string
112 // Open the shared memory region 159char* compose_list(uint128_t mask) {
113 unsigned __int128 *supreme_mask = shmat(shmid, NULL, 0); 160 // List will always be shorter than every TPC, comma-seperated
114 if (supreme_mask == (void*)-1) 161 // 128 TPCs, with 10 1-char, 90 2-char, 28 3-char, 127 commas, and 1 null
115 error(1, errno, "Unable to open control region for PID %d", target_pid); 162 static char list[10 + 90*2 + 28*3 + 128];
116 // Write the requested mask into the shared memory region 163 char* tail = list;
117 *supreme_mask = mask; 164 int last_enabled = -2;
118 } else { // Starting a new task with a mask 165 bool in_range;
119 // TODO: Check other locations for nvidia-cuda-mps-control if its not on the path 166 for (int i = 0; i < 128; i++) {
120 // TODO: Use dup2() to redirect MPS startup messages 167 bool enabled = (mask >> i) & 1;
121 int ret = system("echo -n | nvidia-cuda-mps-control"); 168 if (in_range) {
122 if (ret == -1) 169 if (enabled) {
123 error(1, errno, "Unable to run subshell to check MPS status"); 170 last_enabled = i;
124 if (ret != 0) { // Control deamon not yet started 171 } else {
125 fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n"); 172 tail += sprintf(tail, "%d,", last_enabled);
126 ret = system("nvidia-cuda-mps-control -d"); 173 in_range = false;
127 if (ret == -1)
128 error(1, errno, "Unable to run subshell to start MPS");
129 if (ret == 1) {
130 fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n");
131 return 1;
132 } 174 }
133 fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n"); 175 continue;
176 }
177 if (enabled) {
178 if (last_enabled == i - 1) {
179 in_range = true;
180 tail += sprintf(tail, "-");
181 } else {
182 tail += sprintf(tail, "%d", i);
183 }
184 last_enabled = i;
185 } else {
186 if (last_enabled == i - 1) {
187 tail += sprintf(tail, ",");
188 }
189 }
190 }
191 // Strip trailing comma
192 if (*(tail - 1) == ',')
193 *(tail - 1) = '\0';
194 return list;
195}
196
197// Always returns a valid string
198// (Terminates the program on error)
199char* compose_gpc_list(uint128_t mask) {
200 uint32_t num_gpcs = 0;
201 uint128_t* masks = NULL;
202 libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0);
203 uint128_t gpc_mask = 0;
204 // Try to find correspondence between a list of TPCs and GPCs
205 for (int gpc = 0; gpc < num_gpcs; gpc++) {
206 if ((masks[gpc] & mask) == masks[gpc]) {
207 gpc_mask |= 1 << gpc;
208 mask &= ~masks[gpc];
134 } 209 }
135 // Tell loader to initialize libsmctrl.so first 210 }
136 // TODO: Append, rather than overwrite LD_PRELOAD 211 if (mask)
137 setenv("LD_PRELOAD", "libsmctrl.so", 1); 212 error(1, EINVAL, "Unable to interpret affinity as GPC list; try -t instead of -g");
138 // Explictly set the number of channels, otherwise CUDA will only use two 213 return compose_list(gpc_mask);
139 // (see paper for why that causes problems) 214}
140 setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 1); 215
141 // Check if a mask, or a list of GPCs is being provided 216
142 if (strcmp(argv[1], "--gpc-list") == 0) { 217uint128_t* get_mask_hndl(pid_t target_pid) {
143 // TODO: Support the full syntax that taskset supports 218 char fd_path[277];
144 // We just support X,Y,Z for now 219 int fd;
145 uint32_t num_gpcs = 0; 220 uint128_t *mask_hndl;
146 uint128_t* masks = NULL; 221 DIR *dp;
147 // TODO: Allow specifying GPU ID, rather than assuming 0! 222 struct dirent *entry;
148 libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0); 223 // Search for the file descriptor which represents the libsmctrl control
149 uint128_t mask = 0; 224 // region.
150 int range_start_gpc = -1; 225 snprintf(fd_path, 277, "/proc/%d/fd/", target_pid);
151 char* start = argv[2]; 226 if (!(dp = opendir(fd_path))) {
152 int len = strlen(argv[2]); 227 if (errno == ENOENT)
153 // TODO: Handle invalid input cleanly. 228 error(1, 0, "Unable to find PID %d.", target_pid);
154 // Convert comma-seperated GPC list into a mask 229 else
155 for (int i = 0; i < len + 1; i++) { 230 error(1, errno, "Unable to access PID %d", target_pid);
156 if (argv[2][i] == ',' || argv[2][i] == '\0') { 231 }
157 argv[2][i] = '\0'; 232 while (entry = readdir(dp)) {
158 int gpc = atoi(start); 233 char link[sizeof(LINK_NAME)];
159 if (gpc > num_gpcs - 1) { 234 snprintf(fd_path, 277, "/proc/%d/fd/%s", target_pid, entry->d_name);
160 fprintf(stderr, "Invalid GPC ID '%s'!\n", start); 235 readlink(fd_path, link, sizeof(LINK_NAME));
236 if (strncmp(LINK_NAME, link, sizeof(LINK_NAME) - 1) == 0)
237 break;
238 }
239 closedir(dp);
240 if (!entry)
241 error(1, 0, "Unable to find libsmctrl-wrapper control region for PID %d.", target_pid);
242 // Access the shared memory region for libsmctrl control.
243 if ((fd = open(fd_path, O_RDWR)) == -1)
244 error(1, errno, "Unable to open libsmctrl-wrapper control file %s", fd_path);
245 mask_hndl = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
246 if (mask_hndl == MAP_FAILED)
247 error(1, errno, "Unable to memory-map libsmctrl-wrapper control file %s", fd_path);
248 close(fd);
249 return mask_hndl;
250}
251
252static error_t arg_parser(int key, char* arg, struct argp_state *state){
253 static bool is_cmd = true;
254 static bool is_query = false;
255 static bool is_list = false;
256 static bool use_gpcs = false;
257 static uint128_t mask = 0;
258 static pid_t target_pid = 0;
259 static char **sub_argv = NULL;
260 char *end;
261 // Handle what to do in case of each option
262 switch (key) {
263 case 'g':
264 if (is_list)
265 argp_error(state, "Only one of -g and -t may be specified.\n");
266 use_gpcs = true;
267 is_list = true;
268 break;
269 case 't':
270 if (is_list)
271 argp_error(state, "Only one of -g and -t may be specified.\n");
272 is_list = true;
273 break;
274 case 'p':
275 is_cmd = false;
276 break;
277 case ARGP_KEY_ARG:
278 // Options:
279 // 1. -p and one argument -> Query mask for PID
280 // 2. -p and two arguments -> Set mask for PID
281 // 3. No -p and at least one argument -> Set mask and launch command
282 // (otherwise: invalid)
283 if (state->arg_num == 0 && !is_cmd && state->argc - state->next == 0)
284 is_query = true;
285 // Handle invalid and valid query cases
286 if (is_query) {
287 if (state->arg_num == 0) {
288 target_pid = strtoul(arg, &end, 10);
289 if (*end != '\0')
290 argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end);
291 break;
292 } else
293 return ARGP_ERR_UNKNOWN;
294 }
295 // Handle non-query cases
296 if (state->arg_num == 0 && state->argc - state->next != 0) {
297 if (is_list) {
298 parse_list(use_gpcs, arg, &mask);
299 } else {
300 // strtoul stores a pointer to the first invalid character in `end`
301 mask = strtou128(arg, &end, 16);
302 if (*end != '\0')
303 argp_error(state, "Invalid character \"%c\" in mask argument.\n", *end);
304 }
305 } else if (state->arg_num == 1 && !is_cmd) {
306 target_pid = strtoul(arg, &end, 10);
307 if (*end != '\0')
308 argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end);
309 } else
310 return ARGP_ERR_UNKNOWN;
311 break;
312 case ARGP_KEY_ARGS:
313 if (!is_cmd)
314 return ARGP_ERR_UNKNOWN;
315 sub_argv = state->argv + state->next;
316 break;
317 case ARGP_KEY_END:
318 if (is_query && state->arg_num < 1)
319 argp_usage(state);
320 else if (!is_query && state->arg_num < 2)
321 argp_usage(state);
322 break;
323 case ARGP_KEY_FINI:
324 if (is_query) {
325 // query PID
326 uint128_t* mask_hndl = get_mask_hndl(target_pid);
327 uint128_t enable_mask = ~*mask_hndl;
328 if (use_gpcs & is_list)
329 printf("PID %d's current GPC affinity list: %s\n", target_pid, compose_gpc_list(enable_mask));
330 else if (use_gpcs & !is_list)
331 argp_error(state, "Unsupported to print query as a GPC mask.\n");
332 else if (is_list)
333 printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(enable_mask));
334 else
335 printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(enable_mask >> 64), (uint64_t)enable_mask);
336 } else if (is_cmd) {
337 // start MPS (as needed)
338 if (!libsmctrl_is_mps_running()) {
339 fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n");
340 int ret = system("nvidia-cuda-mps-control -d");
341 if (ret == -1)
342 error(1, errno, "Unable to run subshell to start MPS");
343 if (ret == 1) {
344 fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n");
161 return 1; 345 return 1;
162 } 346 }
163 // Handle ranges 347 fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n");
164 if (range_start_gpc != -1) {
165 if (range_start_gpc >= gpc) {
166 fprintf(stderr, "Invalid GPC range!\n");
167 return 1;
168 }
169 while (range_start_gpc <= gpc) {
170 //printf("gpc %i\n", range_start_gpc);
171 mask |= masks[range_start_gpc];
172 range_start_gpc++;
173 }
174 range_start_gpc = -1;
175 } else {
176 //printf("gpc %i\n", gpc);
177 mask |= masks[gpc];
178 }
179 start = argv[2] + i + 1;
180 } 348 }
181 // Range start 349 // launch subprocess
182 if (argv[2][i] == '-') { 350 // Convert to string, prefix with ~, and set env var
183 argv[2][i] = '\0'; 351 char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0'
184 range_start_gpc = atoi(start); 352 snprintf(mask_str, 36, "~0x%.0lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask);
185 start = argv[2] + i + 1; 353 setenv("LIBSMCTRL_MASK", mask_str, 1);
354 // Start task
355 execvp(sub_argv[0], sub_argv);
356 error(1, errno, "Unable to launch task '%s'", sub_argv[0]);
357 } else {
358 if (!libsmctrl_is_mps_running())
359 printf("Warning: NVIDIA MPS is not running. CUDA programs will not co-run! Run nvidia-cuda-mps-control -d before launching any CUDA-using programs that should co-run.\n");
360 // change mask on PID
361 uint128_t* mask_hndl = get_mask_hndl(target_pid);
362 if (!is_list) {
363 printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, ~(uint64_t)(*mask_hndl >> 64), ~(uint64_t)*mask_hndl);
364 printf("PID %d's new TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(mask >> 64), (uint64_t)mask);
365 } else {
366 printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(~*mask_hndl));
367 printf("PID %d's new TPC affinity list: %s\n", target_pid, compose_list(mask));
186 } 368 }
369 // Write the requested mask into the shared memory region
370 *mask_hndl = ~mask;
187 } 371 }
188 // Convert to string, prefix with ~, and set env var 372 break;
189 char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0' 373 default:
190 snprintf(mask_str, 36, "~0x%lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask); 374 return ARGP_ERR_UNKNOWN;
191 //printf("nvtaskset: Using mask string %s\n", mask_str);
192 setenv("LIBSMCTRL_MASK", mask_str, 1);
193 // Start task
194 execvp(argv[3], argv+3);
195 error(1, errno, "Unable to launch task '%s'", argv[3]);
196 } else {
197 // Tell libsmctrl what mask to use
198 char* mask = malloc(strlen(argv[1]) + 2);
199 mask[0] = '~'; // Make an enable mask
200 strcpy(mask+1, argv[1]);
201 setenv("LIBSMCTRL_MASK", mask, 1);
202 free(mask); // setenv() made a copy
203 // Start task
204 execvp(argv[2], argv+2);
205 error(1, errno, "Unable to launch task '%s'", argv[2]);
206 }
207 } 375 }
208 fprintf(stderr, "Invalid arguments\n"); 376 return 0;
209 return 1; 377}
378
379struct argp argp = {opts, arg_parser, args_doc, desc};
380
381int main(int argc, char **argv) {
382 argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, NULL);
383 return 0;
210} 384}