Rewrite nvtaskset and implementation of partitioning for unmodified tasks

Rather than requiring libsmctrl.so to be preloaded, we now wrap libcuda.so.1. All CUDA-using applications will load libcuda.so.1, ensuring that our wrapper will always be dynamically loaded, no matter if LD_PRELOAD is enabled, or if a program has been statically linked. All that needs to be done is to put the location of our "fake" libcuda.so.1 within the loader search path. This can be done by setting LD_LIBRARY_PATH, or by installing our wrapper into /lib/x86_64-linux-gnu. The mask can still be set via the LIBSMCTRL_MASK environment variable, but the easier-to-use nvtaskset tool is now the recommended way to view or change the supreme TPC mask for any CUDA-using application. This allows launching a program on the first two GPCs via a command as simple as: ./nvtaskset -g 0-1 ./a_program a_program_args (Note that use of the -g option requires the nvdebug kernel module to first be loaded.) These changes support the final version of the ECRTS'25 paper. Note that nvtaskset does not yet fully support multi-GPU systems. Bugfixes: - Fix crash that would occur if both libsmctrl.so and libsmctrl.a were built into an application. - Correctly use GPU ID when initializing a context in `libsmctrl_test_gpc_info`. - Include `nvtaskset` as a prerequisite for `libsmctrl_test_supreme_mask`. - Fix malfunction of `libsmctrl_test_gpc_info` if CUDA_VISIBLE_DEVICES is set. Other minor changes: - Adds make target to run all the tests. - Fixes typos in comments. - Enables -Wall build option. - Upgrades supreme mask from 64 to 128 bits. - Removes `detect_parker_soc()` from the global namespace. - Adjusts test messages to be more succinct. - Updates README with overview of how to partition unmodified applications, more details on the tests, and information on the new ECRTS'25 paper.
author: Joshua Bakita <jbakita@cs.unc.edu> 2025-06-16 19:29:07 -0400
committer: Joshua Bakita <jbakita@cs.unc.edu> 2025-06-17 14:01:49 -0400
commit: 89177fce34edb5ad0059a41548888d05588cc1c5 (patch)
tree: 096dc302bb5e17e3987c45a59ef02c69ec73e9ed
parent: 03ae77e35d35b2a82f5387d1903cfa954b696edd (diff)
7 files changed, 659 insertions, 240 deletions
diff --git a/.gitignore b/.gitignore
index 5f0fdbe..c42b364 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,8 @@ libsmctrl_test_stream_mask
 libsmctrl_test_stream_mask_override
 libsmctrl_test_next_mask
 libsmctrl_test_next_mask_override
+libcuda.so.1
+nvtaskset
 *.pyc
 *.o
 .gdb_history
diff --git a/Makefile b/Makefile
index 62ec245..87a5708 100644
--- a/Makefile
+++ b/Makefile
@@ -3,9 +3,11 @@ CUDA ?= /usr/local/cuda
 # Note that CXX and CC are predefined as g++ and cc (respectively) by Make
 NVCC ?= $(CUDA)/bin/nvcc
 # Everything has to have -lcuda, as it's needed for libsmctrl
-LDFLAGS := -lcuda -I$(CUDA)/include -L$(CUDA)/lib64
+LDFLAGS := -ldl -lcuda -I$(CUDA)/include -L$(CUDA)/lib64
+ARCH = $(shell $(CC) -dumpmachine)
+CFLAGS := -Wall -Wno-parentheses
-.PHONY: clean tests all
+.PHONY: clean tests all install remove run_tests
 # ----- Main Library -----
 libsmctrl.so: libsmctrl.c libsmctrl.h
@@ -14,9 +16,14 @@ libsmctrl.so: libsmctrl.c libsmctrl.h
 # -fPIC is needed even if built as a static library, in case we are linked into
 # another shared library
 libsmctrl.a: libsmctrl.c libsmctrl.h
-        $(CC) $< -c -o libsmctrl.o -fPIC $(CFLAGS) $(LDFLAGS)
+        $(CC) $< -c -o libsmctrl.o -fPIC -DLIBSMCTRL_STATIC $(CFLAGS) $(LDFLAGS)
        ar rcs $@ libsmctrl.o
+# ----- CUDA Wrapper -----
+libcuda.so.1: libsmctrl.c libsmctrl.h
+        $(CC) $< -shared -o $@ -fPIC -DLIBSMCTRL_WRAPPER $(CFLAGS) $(LDFLAGS)
+        patchelf libcuda.so.1 --add-needed libcuda.so
 # ----- Utilities -----
 # Use static linking with tests to avoid LD_LIBRARY_PATH issues
 nvtaskset: nvtaskset.c libsmctrl.so libsmctrl.a
@@ -29,7 +36,7 @@ libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a testbench.h
 libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h
        $(NVCC) -ccbin $(CXX) $< -c -g
-libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
+libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o libcuda.so.1 nvtaskset
        $(NVCC) -ccbin $(CXX) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS)
 libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o
@@ -52,7 +59,7 @@ tests: libsmctrl_test_gpc_info libsmctrl_test_supreme_mask \
       libsmctrl_test_stream_mask_override libsmctrl_test_next_mask \
       libsmctrl_test_next_mask_override
-all: libsmctrl.so nvtaskset tests
+all: libsmctrl.so libcuda.so.1 nvtaskset tests
 clean:
        rm -f libsmctrl.so libsmctrl.o libsmctrl.a libsmctrl_test_gpc_info \
@@ -60,4 +67,32 @@ clean:
              libsmctrl_test_global_mask \
              libsmctrl_test_stream_mask libsmctrl_test_stream_mask_override \
              libsmctrl_test_next_mask libsmctrl_test_next_mask_override \
-              nvtaskset
+              nvtaskset libcuda.so.1
+# Install the wrapper system-wide (uses sudo). The libcuda.so link is pointed
+# directly at the versioned driver library, and the libcuda.so.1 symlink is
+# replaced with a copy of the locally built wrapper.
+install: libcuda.so.1
+        @# Check that CUDA is installed first
+        test -f /lib/$(ARCH)/libcuda.so.*.*
+        @# Change libcuda.so link to bypass libcuda.so.1
+        sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so
+        @# Remove libcuda.so.1 symlink
+        sudo rm /lib/$(ARCH)/libcuda.so.1
+        @# Install wrapper as libcuda.so.1
+        sudo cp libcuda.so.1 /lib/$(ARCH)/libcuda.so.1
+# Undo a system-wide install (uses sudo) by recreating the libcuda.so and
+# libcuda.so.1 symlinks in /lib/$(ARCH).
+remove:
+        @# Test that our library is installed first (the wrapper is copied in as
+        @# a regular file, so libcuda.so.1 must not be a symlink)
+        test ! -L /lib/$(ARCH)/libcuda.so.1
+        @# Overwrite install with original symlinks
+        sudo ln -sf libcuda.so.1 /lib/$(ARCH)/libcuda.so
+        sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so.1
+# Build every test, then run them one after another. Make stops at the first
+# command that exits non-zero, so the closing message is only printed when
+# every test succeeded.
+run_tests: tests
+        ./libsmctrl_test_global_mask
+        ./libsmctrl_test_next_mask
+        ./libsmctrl_test_stream_mask
+        ./libsmctrl_test_next_mask_override
+        ./libsmctrl_test_stream_mask_override
+        @# Must set LD_LIBRARY_PATH in case make install has not been run
+        LD_LIBRARY_PATH=. ./libsmctrl_test_supreme_mask
+        ./libsmctrl_test_gpc_info
+        @ echo "All tests passed!"
diff --git a/README.md b/README.md
index f2be718..11ad153 100644
--- a/README.md
+++ b/README.md
@@ -13,16 +13,37 @@ Please cite this paper in any work which leverages our library. Here's the BibTe
  year={2023},
  month={May},
  pages={54--66},
+  doi={10.1109/RTAS58335.2023.00012},
  _series={RTAS}
 }
 ```
-Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and `libsmctrl.h` for details and examples of how to use this library.
+The ability for `libsmctrl` to work on unmodified tasks was developed as part of a follow-up paper:
+_J. Bakita and J. H. Anderson, "Hardware Compute Partitioning on NVIDIA GPUs for Composable Systems", Proceedings of the 37th Euromicro Conference on Real-Time Systems, pp. 18:1-18:24, July 2025._
+Please cite this paper in any work which uses this for partitioning unmodified tasks. Here's the BibTeX entry:
+```
+@inproceedings{bakita2025hardware,
+  title={Hardware Compute Partitioning on {NVIDIA} {GPUs} for Composable Systems},
+  author={Bakita, Joshua and Anderson, James H},
+  booktitle={Proceedings of the 37th Euromicro Conference on Real-Time Systems},
+  year={2025},
+  month={July},
+  pages={18:1--18:24},
+  doi={10.1109/ECRTS.2025.18},
+  _series={ECRTS}
+}
+```
+Please see [the first paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf), [the second paper](https://www.cs.unc.edu/~jbakita/ecrts25.pdf) and `libsmctrl.h` for details and examples of how to use this library.
 We strongly encourage consulting those resources first; the below comments serve merely as an addendum.
 ## Run-time Dependencies
 `libcuda.so`, which is automatically installed by the NVIDIA GPU driver.
+(Technically `libdl` is also required, but this should never need to be manually installed. This is a dependency of CUDA, and is also part of the GNU C Standard Library starting with version 2.34.)
 ## Building
 To build, ensure that you have `gcc` installed and access to the CUDA SDK including `nvcc`. Then run:
 ```
@@ -66,8 +87,52 @@ nvcc benchmark.cu -o benchmark -I/playpen/libsmctl -lsmctrl -lcuda -L/playpen/li
 ```
 The resultant `benchmark` binary should be portable to any system with an equivalent or newer version of the NVIDIA GPU driver installed.
+## Use Without Application Modification
+As an alternative to modifying your application, `libsmctrl` can be installed system-wide, and partitions for each application can be set via the `nvtaskset` tool.
+The `nvtaskset` tool works very similarly to the Linux CPU-affinity-setting tool `taskset`.
+To install `libsmctrl` system-wide, such that all CUDA-using applications automatically load it, ensure that `patchelf` is installed (`sudo apt install patchelf`), and run:
+```
+make libcuda.so.1 install
+```
+Or, if you do not want to modify any system-wide state, and only want `libsmctrl` loaded as part of anything run from this console:
+```
+make libcuda.so.1
+export LD_LIBRARY_PATH=$(pwd)
+```
+(This works because CUDA is always dynamically loaded from `libcuda.so.1`, and `libsmctrl` creates a "fake" `libcuda.so.1` in this directory that wraps CUDA.
+ Setting `LD_LIBRARY_PATH` ensures that the wrapped version is the first one loaded.
+ The only difference with running `make install` is that it copies our "fake" `libcuda.so.1` to a location where the loader will automatically find it.)
+And then to start an application within a specific TPC partition, e.g., the first 10 TPCs:
+```
+./nvtaskset -t 0-9 my_program my_args
+```
+Note that this will automatically start NVIDIA MPS, which is a prerequisite to co-run tasks on NVIDIA GPUs without timeslicing.
+And to change the TPCs available for process ID 1234 to the first 10 TPCs:
+```
+./nvtaskset -tp 0-9 1234
+```
+Or, to change a process of ID 1234 to only run on GPC 3:
+```
+./nvtaskset -gp 3 1234
+```
+To remove the system-wide installation of `libsmctrl`, run:
+```
+make remove
+```
 ## Run Tests
-To test partitioning:
+To run them all:
+```
+make run_tests
+```
+If you prefer to run them individually, to test partitioning:
 ```
 make tests
 ./libsmctrl_test_global_mask
@@ -82,18 +147,26 @@ make tests
 ./libsmctrl_test_next_mask_override
 ```
-And if `nvdebug` has been installed:
+To test that `nvtaskset` can dynamically change the mask of a running program:
 ```
-make tests
+make libsmctrl_test_supreme_mask
+./libsmctrl_test_supreme_mask
+```
+To test that TPC to GPC mappings can be obtained (if `nvdebug` has been installed):
+```
+make libsmctrl_test_gpc_info
 ./libsmctrl_test_gpc_info
 ```
+The `CUDA_VISIBLE_DEVICES` environment variable can be set to run any of the partitioning tests on a different GPU.
 ## Supported GPUs
 #### Known Working
 - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs
- CUDA 6.5 through 12.6
+- CUDA 6.5 through 12.8
 - `x86_64` and Jetson `aarch64` platforms
 #### Known Issues
diff --git a/libsmctrl.c b/libsmctrl.c
index 6aa471b..79d2b33 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -17,22 +17,27 @@
 * Please contact the authors if support is needed for a particular feature on
 * an older CUDA version. Support for those is unimplemented, not impossible.
 *
- * An old implementation of this file effected the global mask on CUDA 10.2 by
+ * An old implementation of this file affected the global mask on CUDA 10.2 by
 * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
 * That implementation was extraordinarily complicated, and was replaced in
 * 2024 with a more-backward-compatible way of hooking the TMD/QMD.
 * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
 */
+#define _GNU_SOURCE // To enable use of memfd_create()
 #include <cuda.h>
 #include <errno.h>
 #include <error.h>
+#include <dlfcn.h>
 #include <fcntl.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
-#include <sys/ipc.h>
+#include <string.h>
-#include <sys/shm.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
 #include <unistd.h>
 #include "libsmctrl.h"
@@ -48,20 +53,33 @@
 // (No testing attempted on pre-CUDA-6.5 versions)
 // Values for the following three lines can be extracted by tracing CUPTI as
 // it interacts with libcuda.so to set callbacks.
-static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b};
+static const CUuuid callback_funcs_id = {{0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}};
 // These callback descriptors appear to intercept the TMD/QMD late enough that
 // CUDA has already applied the per-stream mask from its internal data
 // structures, allowing us to override it with the next mask.
 #define QMD_DOMAIN 0xb
 #define QMD_PRE_UPLOAD 0x1
+/**
+ * These globals must be non-static (i.e., have global linkage) to ensure that
+ * if multiple copies of the library are loaded (e.g., dynamically linked to
+ * both this program and a dependency), secondary copies do not attempt to
+ * repeat initialization or make changes to unused copies of mask values.
+ */
 // Supreme mask (cannot be overridden)
-static uint64_t *g_supreme_sm_mask = NULL;
+uint128_t *g_supreme_sm_mask = NULL;
 // Global mask (applies across all threads)
-static uint64_t g_sm_mask = 0;
+uint64_t g_sm_mask = 0;
 // Next mask (applies per-thread)
-static __thread uint64_t g_next_sm_mask = 0;
+__thread uint64_t g_next_sm_mask = 0;
 // Flag value to indicate if setup has been completed
-static bool sm_control_setup_called = false;
+bool sm_control_setup_called = false;
+#ifdef LIBSMCTRL_STATIC
+// Special handling for if built as a static library, and the libcuda.so.1
+// libsmctrl wrapper is in use (see comment on setup() constructor for detail).
+static void (*shared_set_global_mask)(uint64_t) = NULL;
+static void (*shared_set_next_mask)(uint64_t) = NULL;
+#endif
 // v1 has been removed---it intercepted the TMD/QMD too early, making it
 // impossible to override the CUDA-injected stream mask with the next mask.
@@ -78,7 +96,7 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
        if (!tmd)
                abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n");
-        uint32_t *lower_ptr, *upper_ptr;
+        uint32_t *lower_ptr, *upper_ptr, *ext_lower_ptr, *ext_upper_ptr;
        // The location of the TMD version field seems consistent across versions
        uint8_t tmd_ver = *(uint8_t*)(tmd + 72);
@@ -87,10 +105,12 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
                // TMD V04_00 is used starting with Hopper to support masking >64 TPCs
                lower_ptr = tmd + 304;
                upper_ptr = tmd + 308;
+                ext_lower_ptr = tmd + 312;
+                ext_upper_ptr = tmd + 316;
                // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
                //      ...global_mask_ext
-                *(uint32_t*)(tmd + 312) = -1;
+                *ext_lower_ptr = -1;
-                *(uint32_t*)(tmd + 316) = -1;
+                *ext_upper_ptr = -1;
                // An enable bit is also required
                *(uint32_t*)tmd |= 0x80000000;
        } else if (tmd_ver >= 0x16) {
@@ -119,6 +139,10 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in
        if (g_supreme_sm_mask) {
                *lower_ptr |= (uint32_t)*g_supreme_sm_mask;
                *upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32);
+                if (tmd_ver >= 0x40) {
+                        *ext_lower_ptr |= (uint32_t)(*g_supreme_sm_mask >> 64);
+                        *ext_upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 96);
+                }
        }
        //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
@@ -163,12 +187,26 @@ static void setup_sm_control_callback() {
 // Set default mask for all launches
 void libsmctrl_set_global_mask(uint64_t mask) {
+#ifdef LIBSMCTRL_STATIC
+        // Special handling for if built as a static library, and the libcuda.so.1
+        // libsmctrl wrapper is in use (see comment on setup() constructor for
+        // detail).
+        if (shared_set_global_mask)
+                return (*shared_set_global_mask)(mask);
+#endif
        setup_sm_control_callback();
        g_sm_mask = mask;
 }
 // Set mask for next launch from this thread
 void libsmctrl_set_next_mask(uint64_t mask) {
+#ifdef LIBSMCTRL_STATIC
+        // Special handling for if built as a static library, and the libcuda.so.1
+        // libsmctrl wrapper is in use (see comment on setup() constructor for
+        // detail).
+        if (shared_set_next_mask)
+                return (*shared_set_next_mask)(mask);
+#endif
        setup_sm_control_callback();
        g_next_sm_mask = mask;
 }
@@ -248,7 +286,7 @@ struct stream_sm_mask_v2 {
 // (CUDA 9.0 behaves slightly different on this platform.)
 // @return 1 if detected, 0 if not, -cuda_err on error
 #if __aarch64__
-int detect_parker_soc() {
+static int detect_parker_soc() {
        int cap_major, cap_minor, err, dev_count;
        if (err = cuDeviceGetCount(&dev_count))
                return -err;
@@ -272,7 +310,7 @@ int detect_parker_soc() {
 }
 #endif // __aarch64__
-// Should work for CUDA 8.0 through 12.6
+// Should work for CUDA 8.0 through 12.8
 // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
 // our header
 void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
@@ -417,7 +455,8 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
        }
 }
-/* INFORMATIONAL FUNCTIONS */
+/*** TPC and GPU Informational Functions ***/
 // Read an integer from a file in `/proc`
 static int read_int_procfile(char* filename, uint64_t* out) {
@@ -590,32 +629,98 @@ abort_cuda:
        return EIO;
 }
+/*** Private functions for nvtaskset and building as a libcuda.so.1 wrapper ***/
+// Check if NVIDIA MPS is running, following the process that `strace` shows
+// `nvidia-cuda-mps-control` to use. MPS is a prerequisite to co-running
+// multiple GPU-using tasks without timeslicing.
+// The control socket is looked up under $CUDA_MPS_PIPE_DIRECTORY, or under
+// /tmp/nvidia-mps if that variable is unset.
+// @return true if a connection to the MPS control socket succeeds, false if
+//         the socket cannot be created, configured, or connected
+bool libsmctrl_is_mps_running() {
+        char *mps_pipe_dir;
+        int mps_ctrl;
+        bool connected;
+        struct sockaddr_un mps_ctrl_addr;
+        mps_ctrl_addr.sun_family = AF_UNIX;
+        const int yes = 1;
+        if (!(mps_pipe_dir = getenv("CUDA_MPS_PIPE_DIRECTORY")))
+                mps_pipe_dir = "/tmp/nvidia-mps";
+        // Socket paths are limited to the size of sun_path; snprintf truncates
+        // anything longer rather than overflowing the buffer
+        snprintf(mps_ctrl_addr.sun_path, sizeof(mps_ctrl_addr.sun_path), "%s/control", mps_pipe_dir);
+        // This mirrors the process `nvidia-cuda-mps-control` uses to detect MPS
+        if ((mps_ctrl = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
+                return false;
+        connected = setsockopt(mps_ctrl, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(yes)) != -1 &&
+                    connect(mps_ctrl, (struct sockaddr*)&mps_ctrl_addr, sizeof(struct sockaddr_un)) != -1;
+        // Always release the descriptor, including when MPS is unreachable
+        close(mps_ctrl);
+        return connected;
+}
+// strtoul-like parser producing a 128-bit unsigned integer. Only base 16 is
+// supported; any other base terminates the program via error().
+// @param nptr   String to parse; an optional "0x" or "0X" prefix is skipped
+// @param endptr If non-NULL, set to the first character that was not consumed
+// @param base   Must be 16
+// @return The parsed value (no overflow detection is performed)
+uint128_t strtou128(const char *nptr, char **endptr, int base) {
+        unsigned __int128 value = 0;
+        const char *cur = nptr;
+        if (base != 16)
+                error(1, EINVAL, "strtou128 only supports base 16");
+        // Step over a "0x" prefix. The second character is only inspected when
+        // the first is '0' (short-circuit evaluation), so this never reads past
+        // the terminating NUL byte
+        if (cur[0] == '0' && (cur[1] == 'x' || cur[1] == 'X'))
+                cur += 2;
+        // Consume hex digits until the first character that is not one
+        for (;; cur++) {
+                unsigned int digit;
+                if (*cur >= '0' && *cur <= '9')
+                        digit = *cur - '0';
+                else if (*cur >= 'a' && *cur <= 'f')
+                        digit = *cur - 'a' + 10;
+                else if (*cur >= 'A' && *cur <= 'F')
+                        digit = *cur - 'A' + 10;
+                else
+                        break;
+                value = value << 4 | digit;
+        }
+        if (endptr)
+                *endptr = (char*)cur;
+        return value;
+}
+#ifdef LIBSMCTRL_WRAPPER
+// The CUDA runtime library uses dlopen() to load CUDA functions from
+// libcuda.so.1. Since we replace that with our wrapper library, we need to
+// also redirect any attempted opens of that shared object to the actual
+// shared library, which is linked to by libcuda.so.
+// @param filename Name of the shared object to open (may be NULL)
+// @param flags    Flags, passed through unmodified
+// @return Whatever dlmopen() returns for the (possibly redirected) name
+void *dlopen(const char *filename, int flags) {
+        // Match the name of the shared object that we stand in for, not the
+        // name that we redirect to
+        if (filename && strcmp(filename, "libcuda.so.1") == 0) {
+                fprintf(stderr, "redirecting dlopen of %s to libcuda.so\n", filename);
+                // A GNU-only dlopen variant
+                return dlmopen(LM_ID_BASE, "libcuda.so", flags);
+        } else
+                return dlmopen(LM_ID_BASE, filename, flags);
+}
 // Allow setting a default mask via an environment variable
 // Also enables libsmctrl to be used on unmodified programs via setting:
-//   LD_PRELOAD=libsmctrl.so LIBSMCTRL_MASK=<your mask> ./my_program
+//   LD_LIBRARY_PATH=libsmctrl LIBSMCTRL_MASK=<your mask> ./my_program
 // Where "<your mask>" is replaced with a disable mask, optionally prefixed
 // with a ~ to invert it (make it an enable mask).
 __attribute__((constructor)) static void setup(void) {
        char *end, *mask_str;
        // If dynamic changes are disabled (due to an error) this variable is
-        // permanently used to store the supreme mask, rather than the SysV shared
+        // permanently used to store the supreme mask, rather than the shared
        // memory segment.
-        static uint64_t mask;
+        static uint128_t mask;
        bool invert = false;
-        int shmid;
-        key_t shm_key;
        mask_str = getenv("LIBSMCTRL_MASK");
+        // Assume no mask if unspecified
        if (!mask_str)
-                return;
+                mask_str = "0";
        if (*mask_str == '~') {
                invert = true;
                mask_str++;
        }
-        // XXX: Doesn't support 128-bit masks
+        mask = strtou128(mask_str, &end, 16);
-        mask = strtoull(mask_str, &end, 0);
        // Verify we were able to parse the whole string
        if (*end != '\0')
                abort(1, EINVAL, "Unable to apply default mask");
@@ -623,35 +728,64 @@ __attribute__((constructor)) static void setup(void) {
        if (invert)
                mask = ~mask;
+        // Explicitly set the number of channels (if unset), otherwise CUDA will only
+        // use two with MPS (see paper for why that causes problems)
+        if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 0) == -1)
+                abort(1, EINVAL, "Unable to configure environment");
+        // Warn if a mask was specified but MPS isn't running
+        if (mask && !libsmctrl_is_mps_running())
+                fprintf(stderr, "libsmctrl-libcuda-wrapper: Warning: TPC mask set via LIBSMCTRL_MASK, but NVIDIA MPS is not running. CUDA programs will not co-run!\n");
        // Initialize CUDA and the interception callback
        setup_sm_control_callback();
-        // TODO: Switch to memfd_create(); this leaks IPC objects
+        // Create shared memory region for the supreme mask such that nvtaskset
-        // Create a SysV IPC key (32 bits) to identify our shared memory region
+        // can read and modify it
-        // Use the pid as the top 16 bits, and "sm" as the bottom 16
+        int fd = memfd_create("libsmctrl", MFD_CLOEXEC);
-        shm_key = getpid();
+        if (fd == -1) {
-        shm_key <<= 16;
-        shm_key |= (int)'s' << 8 | (int)'m';
-        // Obtain or create a 128-bit (16-byte) shared memory region
-        shmid = shmget(shm_key, 16, IPC_CREAT | 0600);
-        if (shmid == -1) {
                abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
                g_supreme_sm_mask = &mask;
                return;
        }
-        // Open the shared memory region
+        if (ftruncate(fd, 16) == -1) {
-        g_supreme_sm_mask = shmat(shmid, NULL, 0);
+                abort(0, errno, "Unable to resize shared memory for dynamic partition changes. Dynamic changes disabled");
-        if (g_supreme_sm_mask == (void*)-1) {
+                g_supreme_sm_mask = &mask;
-                abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
+                return;
+        }
+        if ((g_supreme_sm_mask = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) {
+                abort(0, errno, "Unable to map shared memory for dynamic partition changes. Dynamic changes disabled");
                g_supreme_sm_mask = &mask;
                return;
        }
-        // XXX: This makes the region unopenable to everyone else. Switch to memfd!
-        // Mark the shared memory region for deletion (after we terminate)
-        if (shmctl(shmid, IPC_RMID, NULL) == -1)
-                abort(0, errno, "Unable to mark shared memory for dynamic partition changes for deletion on process termination. Will leak one page of memory.");
        // Set the super-global mask which cannot be overwritten by any libsmctrl
        // API function.
        *g_supreme_sm_mask = mask;
 }
+#elif defined(LIBSMCTRL_STATIC)
+// If this library is statically built into a program, and the libcuda.so.1
+// wrapper is enabled, we force the statically linked version of the library
+// to defer to the function implementations in the wrapper.
+//
+// Longer explanation:
+// If the library has been dynamically linked into a program and the wrapper
+// is in use, the loader will point both to the same set of symbols (since both
+// will do a dynamic lookup at load-time, the global state at the top of this
+// file uses global linkage, and will thus be in the dynamic symbol table, and
+// each lookup will find the same copy.)
+// Symbols from a statically linked library are not included in the dynamic
+// symbol table, and thus can exist in duplicate of those in any shared
+// library. This is a problem, since only one callback function, using one set
+// of global variables can be registered with CUDA. We work around this by
+// having our statically linked library use the functions from the wrapper or
+// any shared library, if one such instance is loaded.
+__attribute__((constructor)) static void setup(void) {
+        // dlsym can only view the dynamic symbol tables, and so these lookups will
+        // fail if neither the wrapper (libcuda.so.1) nor libsmctrl.so are loaded.
+        // (That indicates that we should use the static library implementations.)
+        // These are a NOP on failure since they return NULL when not found.
+        shared_set_next_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_next_mask");
+        shared_set_global_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_global_mask");
+}
+#endif
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c
index 558b80a..afa0876 100644
--- a/libsmctrl_test_gpc_info.c
+++ b/libsmctrl_test_gpc_info.c
@@ -38,6 +38,8 @@ int main(int argc, char** argv) {
                gpu_id = 0;
        // Tell CUDA to use PCI device id ordering (to match nvdebug)
        putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
+        // Allow CUDA to see all devices (to better match nvdebug)
+        unsetenv("CUDA_VISIBLE_DEVICES");
        // A CUDA context is required before reading the topology information
        if ((res = cuInit(0))) {
                const char* name;
@@ -45,7 +47,7 @@ int main(int argc, char** argv) {
                fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name);
                return 1;
        }
-        if ((res = cuCtxCreate(&ctx, 0, 0))) {
+        if ((res = cuCtxCreate(&ctx, 0, gpu_id))) {
                const char* name;
                cuGetErrorName(res, &name);
                fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name);
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu
index 3b7ebcd..8d2bd79 100644
--- a/libsmctrl_test_mask_shared.cu
+++ b/libsmctrl_test_mask_shared.cu
@@ -18,9 +18,8 @@ __global__ void read_and_store_smid(uint8_t* smid_arr) {
  smid_arr[blockIdx.x] = smid;
 }
-// Assuming SMs continue to support a maximum of 2048 resident threads, six
+// Need at least as many blocks as there are SMs on NVIDIA's biggest GPUs
-// blocks of 1024 threads should span at least three SMs without partitioning
+#define NUM_BLOCKS 142
-#define NUM_BLOCKS 142 //6
 static int sort_asc(const void* a, const void* b) {
  return *(uint8_t*)a - *(uint8_t*)b;
@@ -83,8 +82,11 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
    // Apply partitioning to enable only the first TPC of each 32-bit block
    switch (part_type) {
      case PARTITION_SUPREME:
-        printf("%s: Please set mask to '0x%016lx%016lx' for PID %d using the control deamon and press any key to continue...\n", program_invocation_name, (uint64_t)(mask >> 64), (uint64_t)mask, getpid());
+        char cmd[80];
-        fgetc(stdin);
+        // We must invert the mask before passing it to nvtaskset, since
+        // nvtaskset takes an enable mask (as with the taskset command)
+        snprintf(cmd, 80, "./nvtaskset -p 0x%.0lx%016lx %d > /dev/null", ~(uint64_t)(mask >> 64), ~(uint64_t)mask, getpid());
+        system(cmd);
        break;
      case PARTITION_GLOBAL:
        libsmctrl_set_global_mask(mask);
@@ -120,10 +122,9 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
    uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too
    if (uniq_partitioned > sms_per_tpc) {
      printf("%s: ***Test failure.***\n"
-             "%s: Reason: With TPC mask set to "
+             "%s: Reason: With a partition of only one TPC, the test kernel "
-             "constrain all kernels to a single TPC, a kernel of %d blocks of "
+             "of %d blocks of 1024 threads ran on %d SMs (at most %d---one "
-             "1024 threads was launched and found to run on %d SMs (at most %d---"
+             "TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
-             "one TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
      return 1;
    }
@@ -131,18 +132,16 @@ int test_constrained_size_and_location(enum partitioning_type part_type) {
    if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
        smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) {
      printf("%s: ***Test failure.***\n"
-             "%s: Reason: With TPC mask set to "
+             "%s: Reason: With a partition of only TPC %d, the test kernel "
-             "constrain all kernels to TPC %d, a kernel was run and found "
+             "ran on SM IDs as high as %d and as low as %d (range of %d to %d "
-             "to run on an SM IDs: as high as %d and as low as %d (range of %d to %d expected).\n",
+             "expected).\n", program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
-             program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
      return 1;
    }
    // Div by 32 via a shift
    asprintf(&reason[enabled_tpc >> 5],
-         "With a partition enabled which "
+         "With a partition of only TPC %d, the test kernel used only %d "
-         "contained only TPC ID %d, the test kernel was found to use only %d "
+         "SMs (%d without), and all had IDs between %d and %d (were contained"
-         "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained"
         " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc);
  }
diff --git a/nvtaskset.c b/nvtaskset.c
index 74f88d7..4901cbe 100644
--- a/nvtaskset.c
+++ b/nvtaskset.c
@@ -1,210 +1,384 @@
 // Copyright 2025 Joshua Bakita
-// taskset-like utility for the GPU
+// Show or change the GPU core affinity for a CUDA process
+// taskset-like utility for NVIDIA GPUs
 #define _GNU_SOURCE // For program_invocation_name
 #include <argp.h>
+#include <dirent.h>
 #include <errno.h>
 #include <error.h>
+#include <fcntl.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/ipc.h>
+#include <sys/mman.h>
-#include <sys/shm.h>
-#include <sys/types.h>
 #include <unistd.h>
 #include <cuda.h> // To help with getting GPC info
 #include "libsmctrl.h"
-const char* maintainer = "<jbakita@cs.unc.edu>";
+#define LINK_NAME "/memfd:libsmctrl"
-const char* version = "nvtaskset 2025.03";
-const char* desc = "taskset-like utility for NVIDIA GPUs.";
+// TODO: Write automated tests:
+// - Change region of non-existent PID
+// - Change region of permission denied PID
+// - Change region of non-GPU PID
+// - Change GPC list
+// - Change TPC list
+// - Change TPC mask
+// - Start TPC mask
+// - Start TPC list
+// - Start GPC list
+// - Start with subargument containing -
+// - Query GPC list
+// - Query TPC list
+// - Query TPC mask
+// - Set GPC list w/ non-existent GPC
+// - Set TPC list w/ non-existent TPC
+// Private symbols from libsmctrl
+extern bool libsmctrl_is_mps_running();
+extern uint128_t strtou128(const char *nptr, char **endptr, int base);
+const char *argp_program_bug_address = "<jbakita@cs.unc.edu>";
+const char *argp_program_version = "nvtaskset 2025.06";
+// Help text for argp. The part before the \v is printed before the option
+// list, and the part after it is printed following the option list.
+const char desc[] = "Show or change the GPU core affinity for a CUDA process\v"
+                    "Warning: When using GPC lists, this tool currently "
+                    "derives TPC to GPC mappings from the first NVIDIA GPU "
+                    "device in the system (by PCI bus ID). To use the mappings "
+                    "for a different device, use the `libsmctrl_test_gpc_info` "
+                    "tool to get the bitmask of TPCs associated with each GPC, "
+                    "OR them, and then set that bitmask via this tool. Better "
+                    "multi-GPU support is intended for a future release.\n\n"
+                    "Inspired by the Linux taskset utility.";
+const char args_doc[] = "[mask | list] [pid | cmd [args...]]";
 const struct argp_option opts[] = {
+        {"gpc-list", 'g', NULL, 0, "Specify partition as a list of GPCs"},
+        {"tpc-list", 't', NULL, 0, "Specify partition as a list of TPCs"},
+        {"pid",      'p', NULL, 0, "Operate on an existing PID"},
        {0}
 };
-unsigned __int128 strtou128(const char *nptr, char **endptr, int base) {
+// Create a CUDA context and query the associated GPC to TPC mappings
-        unsigned __int128 result = 0;
+// Based off logic in libsmctrl_test_gpc_info
-        if (base != 16)
-                error(1, EINVAL, "Internal error");
-        // Skip a "0x" prefix. Safe due to early evaluation
-        if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X'))
-                nptr += 2;
-        // Until hitting an invalid character
-        while (1) {
-                if (*nptr >= 'a' && *nptr <= 'f')
-                        result = result << 4 | (*nptr - 'a' + 10);
-                else if (*nptr >= 'A' && *nptr <= 'F')
-                        result = result << 4 | (*nptr - 'A' + 10);
-                else if (*nptr >= '0' && *nptr <= '9')
-                        result = result << 4 | (*nptr - '0');
-                else
-                        break;
-                nptr++;
-        }
-        if (endptr)
-                *endptr = (char*)nptr;
-        return result;
-}
 void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int gpu_id) {
        int res;
        CUcontext ctx;
-        // XXX: Copied from libsmctrl_test_gpc_info
+        char *old_order = NULL;
-        // Tell CUDA to use PCI device id ordering (to match nvdebug)
+        // Tell CUDA to use PCI device id ordering (to match nvdebug)
-        putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
+        putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
-        // A CUDA context is required before reading the topology information
+        // Allow CUDA to see all devices (to better match nvdebug)
-        if ((res = cuInit(0))) {
+        if (getenv("CUDA_VISIBLE_DEVICES")) {
-                const char* name;
+                if (!(old_order = strdup(getenv("CUDA_VISIBLE_DEVICES"))))
-                cuGetErrorName(res, &name);
+                        error(1, errno, "Unable to allocate environment string");
-                fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name);
+                unsetenv("CUDA_VISIBLE_DEVICES");
-                exit(1);
+        }
-        }
+        // A CUDA context is required before reading the topology information
-        if ((res = cuCtxCreate(&ctx, 0, 0))) {
+        if ((res = cuInit(0))) {
-                const char* name;
+                const char* name;
-                cuGetErrorName(res, &name);
+                cuGetErrorName(res, &name);
-                fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name);
+                // error() appends its own newline, so the message has none
+                error(1, 0, "Unable to initialize CUDA, error %s", name);
-                exit(1);
+        }
-        }
+        if ((res = cuCtxCreate(&ctx, 0, gpu_id))) {
-        // Pull topology information from libsmctrl
+                const char* name;
-        if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) {
+                cuGetErrorName(res, &name);
-                error(0, res, "libsmctrl_get_gpc_info() failed");
+                // error() appends its own newline, so the message has none
+                error(1, 0, "Unable to create a CUDA context, error %s", name);
-                if (res == ENOENT)
+        }
-                        fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name);
+        // Pull topology information from libsmctrl
-                if (res == EIO)
+        if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) {
-                        fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name);
+                error(0, res, "libsmctrl_get_gpc_info() failed");
-                exit(1);
+                if (res == ENOENT)
-        }
+                        fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name);
-        // Not copied
+                if (res == EIO)
+                        fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name);
+                exit(1);
+        }
+        // Restore the environment (in case we exec() later)
        unsetenv("CUDA_DEVICE_ORDER");
+        if (old_order) {
+                setenv("CUDA_VISIBLE_DEVICES", old_order, 1);
+                free(old_order);
+        }
 }
-int main(int argc, char **argv) {
+int parse_list(bool use_gpcs, char* list, uint128_t *mask_out) {
-        if (argc < 3) {
+        // We support the same ranges as taskset, e.g., X,Y,Z and X,Y-Z
-                fprintf(stderr, "Usage: %s -p <hex mask> <pid>\n", argv[0]);
+        uint32_t num_xpcs = 0; // Either TPC or GPC count, i.e., "X"PC
-                fprintf(stderr, "       %s <hex mask> <command> <argument...>\n", argv[0]);
+        uint128_t* masks = NULL;
-                fprintf(stderr, "       %s --gpc-list <gpc list> <command> <argument...>\n", argv[0]);
+        // TODO: Allow specifying GPU ID, rather than assuming 0!
-                fprintf(stderr, " <hex mask> has a bit set for each TPC to be enabled\n");
+        if (use_gpcs)
-                return 1;
+                libsmctrl_get_gpc_info_ext_easy(&num_xpcs, &masks, 0);
-        }
+        else
-        // TODO: Use a proper argument parser
+                libsmctrl_get_tpc_info_cuda(&num_xpcs, 0);
-        if (strcmp("-p", argv[1]) == 0) { // Setting mask on running task
+        uint128_t mask = 0;
-                char *end;
+        int range_start_xpc = -1;
-                pid_t target_pid = strtoul(argv[2], &end, 10);
+        char* start = list;
-                // strtoul stores a pointer to the first invalid character in `end`
+        int len = strlen(list);
-                if (*end != '\0') {
+        // Convert comma-separated GPC/TPC list into a mask
-                        fprintf(stderr, "Invalid character \"%c\" in PID argument.\n", *end);
+        for (int i = 0; i < len + 1; i++) {
-                        return 1;
+                if (list[i] == ',' || list[i] == '\0') {
+                        list[i] = '\0';
+                        // Parse strictly: atoi() would silently read an empty or
+                        // non-numeric entry (e.g. the "" after a trailing comma) as 0
+                        char *id_end;
+                        long id = strtol(start, &id_end, 10);
+                        // Compare as signed so that a count of zero rejects every ID
+                        if (id_end == start || *id_end != '\0' || id < 0 || id >= (long)num_xpcs)
+                                error(1, EINVAL, "%s is not a valid %s ID", start, use_gpcs ? "GPC" : "TPC");
+                        int xpc = (int)id;
+                        // Handle ranges
+                        if (range_start_xpc != -1) {
+                                if (range_start_xpc >= xpc)
+                                        error(1, EINVAL, "Malformed %s range", use_gpcs ? "GPC" : "TPC");
+                                while (range_start_xpc <= xpc) {
+                                        if (use_gpcs)
+                                                mask |= masks[range_start_xpc];
+                                        else
+                                                mask |= (uint128_t)1 << range_start_xpc;
+                                        range_start_xpc++;
+                                }
+                                range_start_xpc = -1;
+                        } else {
+                                if (use_gpcs)
+                                        mask |= masks[xpc];
+                                else
+                                        mask |= (uint128_t)1 << xpc;
+                        }
+                        start = list + i + 1;
                }
-                unsigned __int128 mask = strtou128(argv[3], &end, 16);
+                // Range start
-                if (*end != '\0') {
+                if (list[i] == '-') {
-                        fprintf(stderr, "Invalid character \"%c\" in mask argument.\n", *end);
+                        list[i] = '\0';
-                        return 1;
+                        // Reject an empty, non-numeric, or out-of-bounds range start;
+                        // atoi() would silently read a malformed one as 0
+                        char *range_end;
+                        long range_id = strtol(start, &range_end, 10);
+                        if (range_end == start || *range_end != '\0' || range_id < 0 || range_id >= (long)num_xpcs)
+                                error(1, EINVAL, "%s is not a valid %s ID", start, use_gpcs ? "GPC" : "TPC");
+                        range_start_xpc = (int)range_id;
+                        start = list + i + 1;
                }
-                // The shared memory lookup key is the lower 16-bits of the PID | "sm"
+        }
-                key_t shm_key = target_pid << 16 | (int)'s' << 8 | (int) 'm';
+        *mask_out = mask;
-                // Get a handle to the 128-bit shared memory region
+        return 0;
-                int shmid = shmget(shm_key, 16, 0);
+}
-                if (shmid == -1)
-                        error(1, errno, "Unable to find control region for PID %d", target_pid);
+// Always returns a valid string
-                // Open the shared memory region
+char* compose_list(uint128_t mask) {
-                unsigned __int128 *supreme_mask = shmat(shmid, NULL, 0);
+        // List will always be shorter than every TPC, comma-separated
-                if (supreme_mask == (void*)-1)
+        // 128 TPCs, with 10 1-char, 90 2-char, 28 3-char, 127 commas, and 1 null
-                        error(1, errno, "Unable to open control region for PID %d", target_pid);
+        static char list[10 + 90*2 + 28*3 + 128];
-                // Write the requested mask into the shared memory region
+        char* tail = list;
-                *supreme_mask = mask;
+        int last_enabled = -2;
-        } else { // Starting a new task with a mask
+        // True between emitting the "-" of a range and emitting its upper bound.
+        // Must start out false, as it is read on the very first iteration.
+        bool in_range = false;
-                // TODO: Check other locations for nvidia-cuda-mps-control if its not on the path
+        for (int i = 0; i < 128; i++) {
-                // TODO: Use dup2() to redirect MPS startup messages
+                bool enabled = (mask >> i) & 1;
-                int ret = system("echo -n | nvidia-cuda-mps-control");
+                if (in_range) {
-                if (ret == -1)
+                        if (enabled) {
-                        error(1, errno, "Unable to run subshell to check MPS status");
+                                last_enabled = i;
-                if (ret != 0) { // Control deamon not yet started
+                        } else {
-                        fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n");
+                                tail += sprintf(tail, "%d,", last_enabled);
-                        ret = system("nvidia-cuda-mps-control -d");
+                                in_range = false;
-                        if (ret == -1)
-                                error(1, errno, "Unable to run subshell to start MPS");
-                        if (ret == 1) {
-                                fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n");
-                                return 1;
                        }
-                        fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n");
+                        continue;
+                }
+                if (enabled) {
+                        if (last_enabled == i - 1) {
+                                in_range = true;
+                                tail += sprintf(tail, "-");
+                        } else {
+                                tail += sprintf(tail, "%d", i);
+                        }
+                        last_enabled = i;
+                } else {
+                        if (last_enabled == i - 1) {
+                                tail += sprintf(tail, ",");
+                        }
+                }
+        }
+        // A range still open here runs through bit 127; emit its upper bound
+        if (in_range)
+                tail += sprintf(tail, "%d", last_enabled);
+        // Strip trailing comma (nothing precedes `tail` if the mask was empty)
+        if (tail > list && *(tail - 1) == ',')
+                tail--;
+        // Always terminate here: the static buffer may still hold the result
+        // of an earlier call if nothing was written on this one
+        *tail = '\0';
+        return list;
+}
+// Always returns a valid string
+// (Terminates the program on error)
+char* compose_gpc_list(uint128_t mask) {
+        uint32_t num_gpcs = 0;
+        uint128_t* masks = NULL;
+        libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0);
+        uint128_t gpc_mask = 0;
+        // Try to find correspondence between a list of TPCs and GPCs
+        for (int gpc = 0; gpc < num_gpcs; gpc++) {
+                if ((masks[gpc] & mask) == masks[gpc]) {
+                        // Shift as uint128_t (as parse_list does), not as an int
+                        gpc_mask |= (uint128_t)1 << gpc;
+                        mask &= ~masks[gpc];
                }
-                // Tell loader to initialize libsmctrl.so first
+        }
-                // TODO: Append, rather than overwrite LD_PRELOAD
+        if (mask)
-                setenv("LD_PRELOAD", "libsmctrl.so", 1);
+                error(1, EINVAL, "Unable to interpret affinity as GPC list; try -t instead of -g");
-                // Explictly set the number of channels, otherwise CUDA will only use two
+        return compose_list(gpc_mask);
-                // (see paper for why that causes problems)
+}
-                setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 1);
-                // Check if a mask, or a list of GPCs is being provided
-                if (strcmp(argv[1], "--gpc-list") == 0) {
+uint128_t* get_mask_hndl(pid_t target_pid) {
-                        // TODO: Support the full syntax that taskset supports
+        char fd_path[277];
-                        // We just support X,Y,Z for now
+        int fd;
-                        uint32_t num_gpcs = 0;
+        uint128_t *mask_hndl;
-                        uint128_t* masks = NULL;
+        DIR *dp;
-                        // TODO: Allow specifying GPU ID, rather than assuming 0!
+        struct dirent *entry;
-                        libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0);
+        // Search for the file descriptor which represents the libsmctrl control
-                        uint128_t mask = 0;
+        // region.
-                        int range_start_gpc = -1;
+        snprintf(fd_path, 277, "/proc/%d/fd/", target_pid);
-                        char* start = argv[2];
+        if (!(dp = opendir(fd_path))) {
-                        int len = strlen(argv[2]);
+                if (errno == ENOENT)
-                        // TODO: Handle invalid input cleanly.
+                        error(1, 0, "Unable to find PID %d.", target_pid);
-                        // Convert comma-seperated GPC list into a mask
+                else
-                        for (int i = 0; i < len + 1; i++) {
+                        error(1, errno, "Unable to access PID %d", target_pid);
-                                if (argv[2][i] == ',' || argv[2][i] == '\0') {
+        }
-                                        argv[2][i] = '\0';
+        while (entry = readdir(dp)) {
-                                        int gpc = atoi(start);
+                char link[sizeof(LINK_NAME)];
-                                        if (gpc > num_gpcs - 1) {
+                snprintf(fd_path, 277, "/proc/%d/fd/%s", target_pid, entry->d_name);
-                                                fprintf(stderr, "Invalid GPC ID '%s'!\n", start);
+                // Skip entries that cannot be read as a symlink (presumably "."
+                // and ".."), or whose target is too short to match; `link` does
+                // not hold a full name to compare in those cases
+                ssize_t link_len = readlink(fd_path, link, sizeof(LINK_NAME));
+                if (link_len < (ssize_t)(sizeof(LINK_NAME) - 1))
+                        continue;
+                if (strncmp(LINK_NAME, link, sizeof(LINK_NAME) - 1) == 0)
+                        break;
+        }
+        closedir(dp);
+        if (!entry)
+                error(1, 0, "Unable to find libsmctrl-wrapper control region for PID %d.", target_pid);
+        // Access the shared memory region for libsmctrl control.
+        if ((fd = open(fd_path, O_RDWR)) == -1)
+                error(1, errno, "Unable to open libsmctrl-wrapper control file %s", fd_path);
+        mask_hndl = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+        if (mask_hndl == MAP_FAILED)
+                error(1, errno, "Unable to memory-map libsmctrl-wrapper control file %s", fd_path);
+        close(fd);
+        return mask_hndl;
+}
+static error_t arg_parser(int key, char* arg, struct argp_state *state){
+        static bool is_cmd = true;
+        static bool is_query = false;
+        static bool is_list = false;
+        static bool use_gpcs = false;
+        static uint128_t mask = 0;
+        static pid_t target_pid = 0;
+        static char **sub_argv = NULL;
+        char *end;
+        // Handle what to do in case of each option
+        switch (key) {
+                case 'g':
+                        if (is_list)
+                                argp_error(state, "Only one of -g and -t may be specified.\n");
+                        use_gpcs = true;
+                        is_list = true;
+                        break;
+                case 't':
+                        if (is_list)
+                                argp_error(state, "Only one of -g and -t may be specified.\n");
+                        is_list = true;
+                        break;
+                case 'p':
+                        is_cmd = false;
+                        break;
+                case ARGP_KEY_ARG:
+                        // Options:
+                        // 1. -p and one argument -> Query mask for PID
+                        // 2. -p and two arguments -> Set mask for PID
+                        // 3. No -p and at least one argument -> Set mask and launch command
+                        // (otherwise: invalid)
+                        if (state->arg_num == 0 && !is_cmd && state->argc - state->next == 0)
+                                is_query = true;
+                        // Handle invalid and valid query cases
+                        if (is_query) {
+                                if (state->arg_num == 0) {
+                                        target_pid = strtoul(arg, &end, 10);
+                                        if (*end != '\0')
+                                                argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end);
+                                        break;
+                                } else
+                                        return ARGP_ERR_UNKNOWN;
+                        }
+                        // Handle non-query cases
+                        if (state->arg_num == 0 && state->argc - state->next != 0) {
+                                if (is_list) {
+                                        parse_list(use_gpcs, arg, &mask);
+                                } else {
+                                        // strtoul stores a pointer to the first invalid character in `end`
+                                        mask = strtou128(arg, &end, 16);
+                                        if (*end != '\0')
+                                                argp_error(state, "Invalid character \"%c\" in mask argument.\n", *end);
+                                }
+                        } else if (state->arg_num == 1 && !is_cmd) {
+                                target_pid = strtoul(arg, &end, 10);
+                                if (*end != '\0')
+                                        argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end);
+                        } else
+                                return ARGP_ERR_UNKNOWN;
+                        break;
+                case ARGP_KEY_ARGS:
+                        if (!is_cmd)
+                                return ARGP_ERR_UNKNOWN;
+                        sub_argv = state->argv + state->next;
+                        break;
+                case ARGP_KEY_END:
+                        if (is_query && state->arg_num < 1)
+                                argp_usage(state);
+                        else if (!is_query && state->arg_num < 2)
+                                argp_usage(state);
+                        break;
+                case ARGP_KEY_FINI:
+                        if (is_query) {
+                                // query PID
+                                uint128_t* mask_hndl = get_mask_hndl(target_pid);
+                                uint128_t enable_mask = ~*mask_hndl;
+                                if (use_gpcs & is_list)
+                                        printf("PID %d's current GPC affinity list: %s\n", target_pid, compose_gpc_list(enable_mask));
+                                else if (use_gpcs & !is_list)
+                                        argp_error(state, "Unsupported to print query as a GPC mask.\n");
+                                else if (is_list)
+                                        printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(enable_mask));
+                                else
+                                        printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(enable_mask >> 64), (uint64_t)enable_mask);
+                        } else if (is_cmd) {
+                                // start MPS (as needed)
+                                if (!libsmctrl_is_mps_running()) {
+                                        fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n");
+                                        int ret = system("nvidia-cuda-mps-control -d");
+                                        if (ret == -1)
+                                                error(1, errno, "Unable to run subshell to start MPS");
+                                        // system() returns a wait status rather than the exit
+                                        // code, so decode it (macros provided via <stdlib.h>)
+                                        if (WIFEXITED(ret) && WEXITSTATUS(ret) == 1) {
+                                                fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n");
                                                return 1;
                                        }
-                                        // Handle ranges
+                                        fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n");
-                                        if (range_start_gpc != -1) {
-                                                if (range_start_gpc >= gpc) {
-                                                        fprintf(stderr, "Invalid GPC range!\n");
-                                                        return 1;
-                                                }
-                                                while (range_start_gpc <= gpc) {
-                                                        //printf("gpc %i\n", range_start_gpc);
-                                                        mask |= masks[range_start_gpc];
-                                                        range_start_gpc++;
-                                                }
-                                                range_start_gpc = -1;
-                                        } else {
-                                                //printf("gpc %i\n", gpc);
-                                                mask |= masks[gpc];
-                                        }
-                                        start = argv[2] + i + 1;
                                }
-                                // Range start
+                                // launch subprocess
-                                if (argv[2][i] == '-') {
+                                // Convert to string, prefix with ~, and set env var
-                                        argv[2][i] = '\0';
+                                char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0'
-                                        range_start_gpc = atoi(start);
+                                snprintf(mask_str, 36, "~0x%.0lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask);
-                                        start = argv[2] + i + 1;
+                                setenv("LIBSMCTRL_MASK", mask_str, 1);
+                                // Start task
+                                execvp(sub_argv[0], sub_argv);
+                                error(1, errno, "Unable to launch task '%s'", sub_argv[0]);
+                        } else {
+                                if (!libsmctrl_is_mps_running())
+                                        printf("Warning: NVIDIA MPS is not running. CUDA programs will not co-run! Run nvidia-cuda-mps-control -d before launching any CUDA-using programs that should co-run.\n");
+                                // change mask on PID
+                                uint128_t* mask_hndl = get_mask_hndl(target_pid);
+                                if (!is_list) {
+                                        printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, ~(uint64_t)(*mask_hndl >> 64), ~(uint64_t)*mask_hndl);
+                                        printf("PID %d's new TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(mask >> 64), (uint64_t)mask);
+                                } else {
+                                        printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(~*mask_hndl));
+                                        printf("PID %d's new TPC affinity list: %s\n", target_pid, compose_list(mask));
                                }
+                                // Write the requested mask into the shared memory region
+                                *mask_hndl = ~mask;
                        }
-                        // Convert to string, prefix with ~, and set env var
+                        break;
-                        char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0'
+                default:
-                        snprintf(mask_str, 36, "~0x%lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask);
+                        return ARGP_ERR_UNKNOWN;
-                        //printf("nvtaskset: Using mask string %s\n", mask_str);
-                        setenv("LIBSMCTRL_MASK", mask_str, 1);
-                        // Start task
-                        execvp(argv[3], argv+3);
-                        error(1, errno, "Unable to launch task '%s'", argv[3]);
-                } else {
-                        // Tell libsmctrl what mask to use
-                        char* mask = malloc(strlen(argv[1]) + 2);
-                        mask[0] = '~'; // Make an enable mask
-                        strcpy(mask+1, argv[1]);
-                        setenv("LIBSMCTRL_MASK", mask, 1);
-                        free(mask); // setenv() made a copy
-                        // Start task
-                        execvp(argv[2], argv+2);
-                        error(1, errno, "Unable to launch task '%s'", argv[2]);
-                }
        }
-        fprintf(stderr, "Invalid arguments\n");
+        return 0;
-        return 1;
+}
+struct argp argp = {opts, arg_parser, args_doc, desc};
+int main(int argc, char **argv) {
+        argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, NULL);
+        return 0;
 }
author	Joshua Bakita <jbakita@cs.unc.edu>	2025-06-16 19:29:07 -0400
committer	Joshua Bakita <jbakita@cs.unc.edu>	2025-06-17 14:01:49 -0400
commit	89177fce34edb5ad0059a41548888d05588cc1c5 (patch)
tree	096dc302bb5e17e3987c45a59ef02c69ec73e9ed
parent	03ae77e35d35b2a82f5387d1903cfa954b696edd (diff)