diff options
| author | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-16 19:29:07 -0400 |
|---|---|---|
| committer | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-17 14:01:49 -0400 |
| commit | 89177fce34edb5ad0059a41548888d05588cc1c5 (patch) | |
| tree | 096dc302bb5e17e3987c45a59ef02c69ec73e9ed | |
| parent | 03ae77e35d35b2a82f5387d1903cfa954b696edd (diff) | |
Rewrite nvtaskset and implementation of partitioning for unmodified tasks
Rather than requiring libsmctrl.so to be preloaded, we now wrap
libcuda.so.1. All CUDA-using applications will load libcuda.so.1,
ensuring that our wrapper will always be dynamically loaded, no
matter if LD_PRELOAD is enabled, or if a program has been staticly
linked. All that needs to be done is that the location of our
"fake" libcuda.so.1 need to be put within the loader search path.
This can be done by setting LD_LIBRARY_PATH, or by installing
our wrapper into /lib/x86_64-linux-gnu.
The mask can still be set via the LIBSMCTRL_MASK environment
variable, but the easier-to-use nvtaskset tool is now the
recommended way to view or change the supreme TPC mask for any
CUDA-using application. This allows launching a program on the
first two GPCs via a command as simple as:
./nvtaskset -g 0-1 ./a_program a_program_args
(Note that use of the -g option requires the nvdebug kernel module
to first be loaded.)
These changes support the final version of the ECRTS'25 paper.
Note that nvtaskset does not yet fully support multi-GPU systems.
Bugfixes:
- Fix crash that would occur if both libsmctrl.so and libsmctrl.a
were built into an application.
- Correctly use GPU ID when initializing a context in
`libsmctrl_test_gpc_info`.
- Include `nvtaskset` as a prerequisite for
`libsmctrl_test_supreme_mask`.
- Fix malfunction of `libsmctrl_test_gpc_info` if
CUDA_VISIBLE_DEVICES is set.
Other minor changes:
- Adds make target to run all the tests.
- Fixes typos in comments.
- Enables -Wall build option.
- Upgrades supreme mask from 64 to 128 bits.
- Removes `detect_parker_soc()` from the global namespace.
- Adjusts test messages to be more succinct.
- Updates README with overview of how to partition unmodified
applications, more details on the tests, and information on the
new ECRTS'25 paper.
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 47 | ||||
| -rw-r--r-- | README.md | 83 | ||||
| -rw-r--r-- | libsmctrl.c | 212 | ||||
| -rw-r--r-- | libsmctrl_test_gpc_info.c | 4 | ||||
| -rw-r--r-- | libsmctrl_test_mask_shared.cu | 31 | ||||
| -rw-r--r-- | nvtaskset.c | 520 |
7 files changed, 659 insertions, 240 deletions
| @@ -8,6 +8,8 @@ libsmctrl_test_stream_mask | |||
| 8 | libsmctrl_test_stream_mask_override | 8 | libsmctrl_test_stream_mask_override |
| 9 | libsmctrl_test_next_mask | 9 | libsmctrl_test_next_mask |
| 10 | libsmctrl_test_next_mask_override | 10 | libsmctrl_test_next_mask_override |
| 11 | libcuda.so.1 | ||
| 12 | nvtaskset | ||
| 11 | *.pyc | 13 | *.pyc |
| 12 | *.o | 14 | *.o |
| 13 | .gdb_history | 15 | .gdb_history |
| @@ -3,9 +3,11 @@ CUDA ?= /usr/local/cuda | |||
| 3 | # Note that CXX and CC are predefined as g++ and cc (respectively) by Make | 3 | # Note that CXX and CC are predefined as g++ and cc (respectively) by Make |
| 4 | NVCC ?= $(CUDA)/bin/nvcc | 4 | NVCC ?= $(CUDA)/bin/nvcc |
| 5 | # Everything has to have -lcuda, as it's needed for libsmctrl | 5 | # Everything has to have -lcuda, as it's needed for libsmctrl |
| 6 | LDFLAGS := -lcuda -I$(CUDA)/include -L$(CUDA)/lib64 | 6 | LDFLAGS := -ldl -lcuda -I$(CUDA)/include -L$(CUDA)/lib64 |
| 7 | ARCH = $(shell $(CC) -dumpmachine) | ||
| 8 | CFLAGS := -Wall -Wno-parentheses | ||
| 7 | 9 | ||
| 8 | .PHONY: clean tests all | 10 | .PHONY: clean tests all install remove run_tests |
| 9 | 11 | ||
| 10 | # ----- Main Library ----- | 12 | # ----- Main Library ----- |
| 11 | libsmctrl.so: libsmctrl.c libsmctrl.h | 13 | libsmctrl.so: libsmctrl.c libsmctrl.h |
| @@ -14,9 +16,14 @@ libsmctrl.so: libsmctrl.c libsmctrl.h | |||
| 14 | # -fPIC is needed even if built as a static library, in case we are linked into | 16 | # -fPIC is needed even if built as a static library, in case we are linked into |
| 15 | # another shared library | 17 | # another shared library |
| 16 | libsmctrl.a: libsmctrl.c libsmctrl.h | 18 | libsmctrl.a: libsmctrl.c libsmctrl.h |
| 17 | $(CC) $< -c -o libsmctrl.o -fPIC $(CFLAGS) $(LDFLAGS) | 19 | $(CC) $< -c -o libsmctrl.o -fPIC -DLIBSMCTRL_STATIC $(CFLAGS) $(LDFLAGS) |
| 18 | ar rcs $@ libsmctrl.o | 20 | ar rcs $@ libsmctrl.o |
| 19 | 21 | ||
| 22 | # ----- CUDA Wrapper ----- | ||
| 23 | libcuda.so.1: libsmctrl.c libsmctrl.h | ||
| 24 | $(CC) $< -shared -o $@ -fPIC -DLIBSMCTRL_WRAPPER $(CFLAGS) $(LDFLAGS) | ||
| 25 | patchelf libcuda.so.1 --add-needed libcuda.so | ||
| 26 | |||
| 20 | # ----- Utilities ----- | 27 | # ----- Utilities ----- |
| 21 | # Use static linking with tests to avoid LD_LIBRARY_PATH issues | 28 | # Use static linking with tests to avoid LD_LIBRARY_PATH issues |
| 22 | nvtaskset: nvtaskset.c libsmctrl.so libsmctrl.a | 29 | nvtaskset: nvtaskset.c libsmctrl.so libsmctrl.a |
| @@ -29,7 +36,7 @@ libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a testbench.h | |||
| 29 | libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h | 36 | libsmctrl_test_mask_shared.o: libsmctrl_test_mask_shared.cu testbench.h |
| 30 | $(NVCC) -ccbin $(CXX) $< -c -g | 37 | $(NVCC) -ccbin $(CXX) $< -c -g |
| 31 | 38 | ||
| 32 | libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o | 39 | libsmctrl_test_supreme_mask: libsmctrl_test_supreme_mask.c libsmctrl.a libsmctrl_test_mask_shared.o libcuda.so.1 nvtaskset |
| 33 | $(NVCC) -ccbin $(CXX) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS) | 40 | $(NVCC) -ccbin $(CXX) $@.c -o $@ libsmctrl_test_mask_shared.o -g -L. -l:libsmctrl.a $(LDFLAGS) |
| 34 | 41 | ||
| 35 | libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o | 42 | libsmctrl_test_global_mask: libsmctrl_test_global_mask.c libsmctrl.a libsmctrl_test_mask_shared.o |
| @@ -52,7 +59,7 @@ tests: libsmctrl_test_gpc_info libsmctrl_test_supreme_mask \ | |||
| 52 | libsmctrl_test_stream_mask_override libsmctrl_test_next_mask \ | 59 | libsmctrl_test_stream_mask_override libsmctrl_test_next_mask \ |
| 53 | libsmctrl_test_next_mask_override | 60 | libsmctrl_test_next_mask_override |
| 54 | 61 | ||
| 55 | all: libsmctrl.so nvtaskset tests | 62 | all: libsmctrl.so libcuda.so.1 nvtaskset tests |
| 56 | 63 | ||
| 57 | clean: | 64 | clean: |
| 58 | rm -f libsmctrl.so libsmctrl.o libsmctrl.a libsmctrl_test_gpc_info \ | 65 | rm -f libsmctrl.so libsmctrl.o libsmctrl.a libsmctrl_test_gpc_info \ |
| @@ -60,4 +67,32 @@ clean: | |||
| 60 | libsmctrl_test_global_mask \ | 67 | libsmctrl_test_global_mask \ |
| 61 | libsmctrl_test_stream_mask libsmctrl_test_stream_mask_override \ | 68 | libsmctrl_test_stream_mask libsmctrl_test_stream_mask_override \ |
| 62 | libsmctrl_test_next_mask libsmctrl_test_next_mask_override \ | 69 | libsmctrl_test_next_mask libsmctrl_test_next_mask_override \ |
| 63 | nvtaskset | 70 | nvtaskset libcuda.so.1 |
| 71 | |||
| 72 | install: libcuda.so.1 | ||
| 73 | @# Check that CUDA is installed first | ||
| 74 | test -f /lib/$(ARCH)/libcuda.so.*.* | ||
| 75 | @# Change libcuda.so link to bypass libcuda.so.1 | ||
| 76 | sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so | ||
| 77 | @# Remove libcuda.so.1 symlink | ||
| 78 | sudo rm /lib/$(ARCH)/libcuda.so.1 | ||
| 79 | @# Install wrapper as libcuda.so.1 | ||
| 80 | sudo cp libcuda.so.1 /lib/$(ARCH)/libcuda.so.1 | ||
| 81 | |||
| 82 | remove: | ||
| 83 | @# Test that our library in installed first | ||
| 84 | test ! -L /lib/$(ARCH)/libcuda.so.1 | ||
| 85 | @# Overwrite install with original symlinks | ||
| 86 | sudo ln -sf libcuda.so.1 /lib/$(ARCH)/libcuda.so | ||
| 87 | sudo ln -sf /lib/$(ARCH)/libcuda.so.*.* /lib/$(ARCH)/libcuda.so.1 | ||
| 88 | |||
| 89 | run_tests: tests | ||
| 90 | ./libsmctrl_test_global_mask | ||
| 91 | ./libsmctrl_test_next_mask | ||
| 92 | ./libsmctrl_test_stream_mask | ||
| 93 | ./libsmctrl_test_next_mask_override | ||
| 94 | ./libsmctrl_test_stream_mask_override | ||
| 95 | @# Must set LD_LIBRARY_PATH in case make install has not been run | ||
| 96 | LD_LIBRARY_PATH=. ./libsmctrl_test_supreme_mask | ||
| 97 | ./libsmctrl_test_gpc_info | ||
| 98 | @ echo "All tests passed!" | ||
| @@ -13,16 +13,37 @@ Please cite this paper in any work which leverages our library. Here's the BibTe | |||
| 13 | year={2023}, | 13 | year={2023}, |
| 14 | month={May}, | 14 | month={May}, |
| 15 | pages={54--66}, | 15 | pages={54--66}, |
| 16 | doi={10.1109/RTAS58335.2023.00012}, | ||
| 16 | _series={RTAS} | 17 | _series={RTAS} |
| 17 | } | 18 | } |
| 18 | ``` | 19 | ``` |
| 19 | 20 | ||
| 20 | Please see [the paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf) and `libsmctrl.h` for details and examples of how to use this library. | 21 | The ability for `libsmctrl` to work on unmodified tasks was developed as part of a follow-up paper: |
| 22 | |||
| 23 | _J. Bakita and J. H. Anderson, "Hardware Compute Partitioning on NVIDIA GPUs for Composable Systems", Proceedings of the 37th Euromicro Conference on Real-Time Systems, pp. 18:1-18:24, July 2025._ | ||
| 24 | |||
| 25 | Please cite this paper in any work which uses this for partitioning unmodified tasks. Here's the BibTeX entry: | ||
| 26 | ``` | ||
| 27 | @inproceedings{bakita2025hardware, | ||
| 28 | title={Hardware Compute Partitioning on {NVIDIA} {GPUs} for Composable Systems}, | ||
| 29 | author={Bakita, Joshua and Anderson, James H}, | ||
| 30 | booktitle={Proceedings of the 37th Euromicro Conference on Real-Time Systems}, | ||
| 31 | year={2025}, | ||
| 32 | month={July}, | ||
| 33 | pages={18:1--18:24}, | ||
| 34 | doi={10.1109/ECRTS.2025.18}, | ||
| 35 | _series={ECRTS} | ||
| 36 | } | ||
| 37 | ``` | ||
| 38 | |||
| 39 | Please see [the first paper](https://www.cs.unc.edu/~jbakita/rtas23.pdf), [the second paper](https://www.cs.unc.edu/~jbakita/ecrts25.pdf) and `libsmctrl.h` for details and examples of how to use this library. | ||
| 21 | We strongly encourage consulting those resources first; the below comments serve merely as an appendum. | 40 | We strongly encourage consulting those resources first; the below comments serve merely as an appendum. |
| 22 | 41 | ||
| 23 | ## Run-time Dependencies | 42 | ## Run-time Dependencies |
| 24 | `libcuda.so`, which is automatically installed by the NVIDIA GPU driver. | 43 | `libcuda.so`, which is automatically installed by the NVIDIA GPU driver. |
| 25 | 44 | ||
| 45 | (Technically `libdl` is also required, but this should never need to be manually installed. This is a dependency of CUDA, and is also part of the GNU C Standard Library starting with version 2.34.) | ||
| 46 | |||
| 26 | ## Building | 47 | ## Building |
| 27 | To build, ensure that you have `gcc` installed and access to the CUDA SDK including `nvcc`. Then run: | 48 | To build, ensure that you have `gcc` installed and access to the CUDA SDK including `nvcc`. Then run: |
| 28 | ``` | 49 | ``` |
| @@ -66,8 +87,52 @@ nvcc benchmark.cu -o benchmark -I/playpen/libsmctl -lsmctrl -lcuda -L/playpen/li | |||
| 66 | ``` | 87 | ``` |
| 67 | The resultant `benchmark` binary should be portable to any system with an equivalent or newer version of the NVIDIA GPU driver installed. | 88 | The resultant `benchmark` binary should be portable to any system with an equivalent or newer version of the NVIDIA GPU driver installed. |
| 68 | 89 | ||
| 90 | ## Use Without Application Modification | ||
| 91 | As an alternative to modifying your application, `libsmctrl` can be installed system-wide, and partitions for each application can be set via the `nvtaskset` tool. | ||
| 92 | The `nvtaskset` tool works very similarly to the Linux CPU-affinity-setting tool `taskset`. | ||
| 93 | |||
| 94 | To install `libsmctrl` system-wide, such that all CUDA-using applications automatically load it, ensure that `patchelf` is installed (`sudo apt install patchelf`), and run: | ||
| 95 | ``` | ||
| 96 | make libcuda.so.1 install | ||
| 97 | ``` | ||
| 98 | Or, if you do not want to modify any system-wide state, and only want `libsmctrl` loaded as part of anything run from this console: | ||
| 99 | ``` | ||
| 100 | make libcuda.so.1 | ||
| 101 | export LD_LIBRARY_PATH=$(pwd) | ||
| 102 | ``` | ||
| 103 | (This works because CUDA is always dynamically loaded from `libcuda.so.1`, and `lbsmctrl` creates a "fake" `libcuda.so.1` in this directory that wraps CUDA. | ||
| 104 | Setting `LD_LIBRARY_PATH` ensures that the wrapped version is the first one loaded. | ||
| 105 | The only difference with running `make install` is that it copies our "fake" `libcuda.so.1` to a location where the loader will automatically find it.) | ||
| 106 | |||
| 107 | And then to start an application within a specific TPC partition, e.g., the first 10 TPCs: | ||
| 108 | ``` | ||
| 109 | ./nvtaskset -t 0-9 my_program my_args | ||
| 110 | ``` | ||
| 111 | Note that this will automatically start NVIDIA MPS, which is a prerequisite to co-run tasks on NVIDIA GPUs without timeslicing. | ||
| 112 | |||
| 113 | And to change the TPCs available for a process ID 1234 to to the first 10 TPCs: | ||
| 114 | ``` | ||
| 115 | ./nvtaskset -tp 0-9 1234 | ||
| 116 | ``` | ||
| 117 | |||
| 118 | Or, to change a process of ID 1234 to only run on GPC 3: | ||
| 119 | ``` | ||
| 120 | ./nvtaskset -gp 3 1234 | ||
| 121 | ``` | ||
| 122 | |||
| 123 | To remove the system-wide installation of `libsmctrl`, run: | ||
| 124 | ``` | ||
| 125 | make remove | ||
| 126 | ``` | ||
| 127 | |||
| 69 | ## Run Tests | 128 | ## Run Tests |
| 70 | To test partitioning: | 129 | |
| 130 | To run them all: | ||
| 131 | ``` | ||
| 132 | make run_tests | ||
| 133 | ``` | ||
| 134 | |||
| 135 | If you prefer to run them individually, to test partitioning: | ||
| 71 | ``` | 136 | ``` |
| 72 | make tests | 137 | make tests |
| 73 | ./libsmctrl_test_global_mask | 138 | ./libsmctrl_test_global_mask |
| @@ -82,18 +147,26 @@ make tests | |||
| 82 | ./libsmctrl_test_next_mask_override | 147 | ./libsmctrl_test_next_mask_override |
| 83 | ``` | 148 | ``` |
| 84 | 149 | ||
| 85 | And if `nvdebug` has been installed: | 150 | To test that `nvtaskset` can dynamically change the mask of a running program: |
| 86 | ``` | 151 | ``` |
| 87 | make tests | 152 | make libsmctrl_test_supreme_mask |
| 153 | ./libsmctrl_test_supreme_mask | ||
| 154 | ``` | ||
| 155 | |||
| 156 | To test that TPC to GPC mappings can be obtained (if `nvdebug` has been installed): | ||
| 157 | ``` | ||
| 158 | make libsmctrl_test_gpc_info | ||
| 88 | ./libsmctrl_test_gpc_info | 159 | ./libsmctrl_test_gpc_info |
| 89 | ``` | 160 | ``` |
| 90 | 161 | ||
| 162 | The `CUDA_VISIBLE_DEVICES` environment variable can be set to run any of the partitioning tests on a different GPU. | ||
| 163 | |||
| 91 | ## Supported GPUs | 164 | ## Supported GPUs |
| 92 | 165 | ||
| 93 | #### Known Working | 166 | #### Known Working |
| 94 | 167 | ||
| 95 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs | 168 | - NVIDIA GPUs from compute capability 3.5 through 8.9, including embedded "Jetson" GPUs |
| 96 | - CUDA 6.5 through 12.6 | 169 | - CUDA 6.5 through 12.8 |
| 97 | - `x86_64` and Jetson `aarch64` platforms | 170 | - `x86_64` and Jetson `aarch64` platforms |
| 98 | 171 | ||
| 99 | #### Known Issues | 172 | #### Known Issues |
diff --git a/libsmctrl.c b/libsmctrl.c index 6aa471b..79d2b33 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
| @@ -17,22 +17,27 @@ | |||
| 17 | * Please contact the authors if support is needed for a particular feature on | 17 | * Please contact the authors if support is needed for a particular feature on |
| 18 | * an older CUDA version. Support for those is unimplemented, not impossible. | 18 | * an older CUDA version. Support for those is unimplemented, not impossible. |
| 19 | * | 19 | * |
| 20 | * An old implementation of this file effected the global mask on CUDA 10.2 by | 20 | * An old implementation of this file affected the global mask on CUDA 10.2 by |
| 21 | * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD. | 21 | * changing a field in CUDA's global struct that CUDA applies to the QMD/TMD. |
| 22 | * That implementation was extraordinarily complicated, and was replaced in | 22 | * That implementation was extraordinarily complicated, and was replaced in |
| 23 | * 2024 with a more-backward-compatible way of hooking the TMD/QMD. | 23 | * 2024 with a more-backward-compatible way of hooking the TMD/QMD. |
| 24 | * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`. | 24 | * View the old implementation via Git: `git show aa63a02e:libsmctrl.c`. |
| 25 | */ | 25 | */ |
| 26 | #define _GNU_SOURCE // To enable use of memfd_create() | ||
| 26 | #include <cuda.h> | 27 | #include <cuda.h> |
| 27 | 28 | ||
| 28 | #include <errno.h> | 29 | #include <errno.h> |
| 29 | #include <error.h> | 30 | #include <error.h> |
| 31 | #include <dlfcn.h> | ||
| 30 | #include <fcntl.h> | 32 | #include <fcntl.h> |
| 31 | #include <stdbool.h> | 33 | #include <stdbool.h> |
| 32 | #include <stdint.h> | 34 | #include <stdint.h> |
| 33 | #include <stdio.h> | 35 | #include <stdio.h> |
| 34 | #include <sys/ipc.h> | 36 | #include <string.h> |
| 35 | #include <sys/shm.h> | 37 | #include <sys/mman.h> |
| 38 | #include <sys/socket.h> | ||
| 39 | #include <sys/types.h> | ||
| 40 | #include <sys/un.h> | ||
| 36 | #include <unistd.h> | 41 | #include <unistd.h> |
| 37 | 42 | ||
| 38 | #include "libsmctrl.h" | 43 | #include "libsmctrl.h" |
| @@ -48,20 +53,33 @@ | |||
| 48 | // (No testing attempted on pre-CUDA-6.5 versions) | 53 | // (No testing attempted on pre-CUDA-6.5 versions) |
| 49 | // Values for the following three lines can be extracted by tracing CUPTI as | 54 | // Values for the following three lines can be extracted by tracing CUPTI as |
| 50 | // it interects with libcuda.so to set callbacks. | 55 | // it interects with libcuda.so to set callbacks. |
| 51 | static const CUuuid callback_funcs_id = {0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}; | 56 | static const CUuuid callback_funcs_id = {{0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}}; |
| 52 | // These callback descriptors appear to intercept the TMD/QMD late enough that | 57 | // These callback descriptors appear to intercept the TMD/QMD late enough that |
| 53 | // CUDA has already applied the per-stream mask from its internal data | 58 | // CUDA has already applied the per-stream mask from its internal data |
| 54 | // structures, allowing us to override it with the next mask. | 59 | // structures, allowing us to override it with the next mask. |
| 55 | #define QMD_DOMAIN 0xb | 60 | #define QMD_DOMAIN 0xb |
| 56 | #define QMD_PRE_UPLOAD 0x1 | 61 | #define QMD_PRE_UPLOAD 0x1 |
| 62 | /** | ||
| 63 | * These globals must be non-static (i.e., have global linkage) to ensure that | ||
| 64 | * if multiple copies of the library are loaded (e.g., dynamically linked to | ||
| 65 | * both this program and a dependency), secondary copies do not attempt to | ||
| 66 | * repeat initialization or make changes to unused copies of mask values. | ||
| 67 | */ | ||
| 57 | // Supreme mask (cannot be overridden) | 68 | // Supreme mask (cannot be overridden) |
| 58 | static uint64_t *g_supreme_sm_mask = NULL; | 69 | uint128_t *g_supreme_sm_mask = NULL; |
| 59 | // Global mask (applies across all threads) | 70 | // Global mask (applies across all threads) |
| 60 | static uint64_t g_sm_mask = 0; | 71 | uint64_t g_sm_mask = 0; |
| 61 | // Next mask (applies per-thread) | 72 | // Next mask (applies per-thread) |
| 62 | static __thread uint64_t g_next_sm_mask = 0; | 73 | __thread uint64_t g_next_sm_mask = 0; |
| 63 | // Flag value to indicate if setup has been completed | 74 | // Flag value to indicate if setup has been completed |
| 64 | static bool sm_control_setup_called = false; | 75 | bool sm_control_setup_called = false; |
| 76 | |||
| 77 | #ifdef LIBSMCTRL_STATIC | ||
| 78 | // Special handling for if built as a static library, and the libcuda.so.1 | ||
| 79 | // libsmctrl wrapper is in use (see comment on setup() constructor for detail). | ||
| 80 | static void (*shared_set_global_mask)(uint64_t) = NULL; | ||
| 81 | static void (*shared_set_next_mask)(uint64_t) = NULL; | ||
| 82 | #endif | ||
| 65 | 83 | ||
| 66 | // v1 has been removed---it intercepted the TMD/QMD too early, making it | 84 | // v1 has been removed---it intercepted the TMD/QMD too early, making it |
| 67 | // impossible to override the CUDA-injected stream mask with the next mask. | 85 | // impossible to override the CUDA-injected stream mask with the next mask. |
| @@ -78,7 +96,7 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in | |||
| 78 | if (!tmd) | 96 | if (!tmd) |
| 79 | abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n"); | 97 | abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n"); |
| 80 | 98 | ||
| 81 | uint32_t *lower_ptr, *upper_ptr; | 99 | uint32_t *lower_ptr, *upper_ptr, *ext_lower_ptr, *ext_upper_ptr; |
| 82 | 100 | ||
| 83 | // The location of the TMD version field seems consistent across versions | 101 | // The location of the TMD version field seems consistent across versions |
| 84 | uint8_t tmd_ver = *(uint8_t*)(tmd + 72); | 102 | uint8_t tmd_ver = *(uint8_t*)(tmd + 72); |
| @@ -87,10 +105,12 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in | |||
| 87 | // TMD V04_00 is used starting with Hopper to support masking >64 TPCs | 105 | // TMD V04_00 is used starting with Hopper to support masking >64 TPCs |
| 88 | lower_ptr = tmd + 304; | 106 | lower_ptr = tmd + 304; |
| 89 | upper_ptr = tmd + 308; | 107 | upper_ptr = tmd + 308; |
| 108 | ext_lower_ptr = tmd + 312; | ||
| 109 | ext_upper_ptr = tmd + 316; | ||
| 90 | // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and | 110 | // XXX: Disable upper 64 TPCs until we have ...next_mask_ext and |
| 91 | // ...global_mask_ext | 111 | // ...global_mask_ext |
| 92 | *(uint32_t*)(tmd + 312) = -1; | 112 | *ext_lower_ptr = -1; |
| 93 | *(uint32_t*)(tmd + 316) = -1; | 113 | *ext_upper_ptr = -1; |
| 94 | // An enable bit is also required | 114 | // An enable bit is also required |
| 95 | *(uint32_t*)tmd |= 0x80000000; | 115 | *(uint32_t*)tmd |= 0x80000000; |
| 96 | } else if (tmd_ver >= 0x16) { | 116 | } else if (tmd_ver >= 0x16) { |
| @@ -119,6 +139,10 @@ static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in | |||
| 119 | if (g_supreme_sm_mask) { | 139 | if (g_supreme_sm_mask) { |
| 120 | *lower_ptr |= (uint32_t)*g_supreme_sm_mask; | 140 | *lower_ptr |= (uint32_t)*g_supreme_sm_mask; |
| 121 | *upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32); | 141 | *upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32); |
| 142 | if (tmd_ver >= 0x40) { | ||
| 143 | *ext_lower_ptr |= (uint32_t)(*g_supreme_sm_mask >> 64); | ||
| 144 | *ext_upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 96); | ||
| 145 | } | ||
| 122 | } | 146 | } |
| 123 | 147 | ||
| 124 | //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr); | 148 | //fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr); |
| @@ -163,12 +187,26 @@ static void setup_sm_control_callback() { | |||
| 163 | 187 | ||
| 164 | // Set default mask for all launches | 188 | // Set default mask for all launches |
| 165 | void libsmctrl_set_global_mask(uint64_t mask) { | 189 | void libsmctrl_set_global_mask(uint64_t mask) { |
| 190 | #ifdef LIBSMCTRL_STATIC | ||
| 191 | // Special handling for if built as a static library, and the libcuda.so.1 | ||
| 192 | // libsmctrl wrapper is in use (see comment on setup() constructor for | ||
| 193 | // detail). | ||
| 194 | if (shared_set_global_mask) | ||
| 195 | return (*shared_set_global_mask)(mask); | ||
| 196 | #endif | ||
| 166 | setup_sm_control_callback(); | 197 | setup_sm_control_callback(); |
| 167 | g_sm_mask = mask; | 198 | g_sm_mask = mask; |
| 168 | } | 199 | } |
| 169 | 200 | ||
| 170 | // Set mask for next launch from this thread | 201 | // Set mask for next launch from this thread |
| 171 | void libsmctrl_set_next_mask(uint64_t mask) { | 202 | void libsmctrl_set_next_mask(uint64_t mask) { |
| 203 | #ifdef LIBSMCTRL_STATIC | ||
| 204 | // Special handling for if built as a static library, and the libcuda.so.1 | ||
| 205 | // libsmctrl wrapper is in use (see comment on setup() constructor for | ||
| 206 | // detail). | ||
| 207 | if (shared_set_next_mask) | ||
| 208 | return (*shared_set_next_mask)(mask); | ||
| 209 | #endif | ||
| 172 | setup_sm_control_callback(); | 210 | setup_sm_control_callback(); |
| 173 | g_next_sm_mask = mask; | 211 | g_next_sm_mask = mask; |
| 174 | } | 212 | } |
| @@ -248,7 +286,7 @@ struct stream_sm_mask_v2 { | |||
| 248 | // (CUDA 9.0 behaves slightly different on this platform.) | 286 | // (CUDA 9.0 behaves slightly different on this platform.) |
| 249 | // @return 1 if detected, 0 if not, -cuda_err on error | 287 | // @return 1 if detected, 0 if not, -cuda_err on error |
| 250 | #if __aarch64__ | 288 | #if __aarch64__ |
| 251 | int detect_parker_soc() { | 289 | static int detect_parker_soc() { |
| 252 | int cap_major, cap_minor, err, dev_count; | 290 | int cap_major, cap_minor, err, dev_count; |
| 253 | if (err = cuDeviceGetCount(&dev_count)) | 291 | if (err = cuDeviceGetCount(&dev_count)) |
| 254 | return -err; | 292 | return -err; |
| @@ -272,7 +310,7 @@ int detect_parker_soc() { | |||
| 272 | } | 310 | } |
| 273 | #endif // __aarch64__ | 311 | #endif // __aarch64__ |
| 274 | 312 | ||
| 275 | // Should work for CUDA 8.0 through 12.6 | 313 | // Should work for CUDA 8.0 through 12.8 |
| 276 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in | 314 | // A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in |
| 277 | // our header | 315 | // our header |
| 278 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { | 316 | void libsmctrl_set_stream_mask(void* stream, uint64_t mask) { |
| @@ -417,7 +455,8 @@ void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) { | |||
| 417 | } | 455 | } |
| 418 | } | 456 | } |
| 419 | 457 | ||
| 420 | /* INFORMATIONAL FUNCTIONS */ | 458 | |
| 459 | /*** TPC and GPU Informational Functions ***/ | ||
| 421 | 460 | ||
| 422 | // Read an integer from a file in `/proc` | 461 | // Read an integer from a file in `/proc` |
| 423 | static int read_int_procfile(char* filename, uint64_t* out) { | 462 | static int read_int_procfile(char* filename, uint64_t* out) { |
| @@ -590,32 +629,98 @@ abort_cuda: | |||
| 590 | return EIO; | 629 | return EIO; |
| 591 | } | 630 | } |
| 592 | 631 | ||
| 632 | |||
| 633 | /*** Private functions for nvtaskset and building as a libcuda.so.1 wrapper ***/ | ||
| 634 | |||
| 635 | // Check if NVIDIA MPS is running, following the process that `strace` shows | ||
| 636 | // `nvidia-cuda-mps-control` to use. MPS is a prerequisite to co-running | ||
| 637 | // multiple GPU-using tasks without timeslicing. | ||
| 638 | bool libsmctrl_is_mps_running() { | ||
| 639 | char *mps_pipe_dir; | ||
| 640 | int mps_ctrl; | ||
| 641 | struct sockaddr_un mps_ctrl_addr; | ||
| 642 | mps_ctrl_addr.sun_family = AF_UNIX; | ||
| 643 | const int yes = 1; | ||
| 644 | |||
| 645 | if (!(mps_pipe_dir = getenv("CUDA_MPS_PIPE_DIRECTORY"))) | ||
| 646 | mps_pipe_dir = "/tmp/nvidia-mps"; | ||
| 647 | // Pipe names are limited to 108 characters long | ||
| 648 | snprintf(mps_ctrl_addr.sun_path, 108, "%s/control", mps_pipe_dir); | ||
| 649 | // This mirrors the process `nvidia-cuda-mps-control` uses to detect MPS | ||
| 650 | if ((mps_ctrl = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1) | ||
| 651 | return false; | ||
| 652 | if (setsockopt(mps_ctrl, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(yes)) == -1) | ||
| 653 | return false; | ||
| 654 | if (connect(mps_ctrl, &mps_ctrl_addr, sizeof(struct sockaddr_un)) == -1) | ||
| 655 | return false; | ||
| 656 | close(mps_ctrl); | ||
| 657 | return true; | ||
| 658 | } | ||
| 659 | |||
| 660 | // A variant of strtoul with support for 128-bit integers | ||
| 661 | uint128_t strtou128(const char *nptr, char **endptr, int base) { | ||
| 662 | unsigned __int128 result = 0; | ||
| 663 | if (base != 16) | ||
| 664 | error(1, EINVAL, "strtou128 only supports base 16"); | ||
| 665 | // Skip a "0x" prefix. Safe due to early evaluation | ||
| 666 | if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X')) | ||
| 667 | nptr += 2; | ||
| 668 | // Until hitting an invalid character | ||
| 669 | while (1) { | ||
| 670 | if (*nptr >= 'a' && *nptr <= 'f') | ||
| 671 | result = result << 4 | (*nptr - 'a' + 10); | ||
| 672 | else if (*nptr >= 'A' && *nptr <= 'F') | ||
| 673 | result = result << 4 | (*nptr - 'A' + 10); | ||
| 674 | else if (*nptr >= '0' && *nptr <= '9') | ||
| 675 | result = result << 4 | (*nptr - '0'); | ||
| 676 | else | ||
| 677 | break; | ||
| 678 | nptr++; | ||
| 679 | } | ||
| 680 | if (endptr) | ||
| 681 | *endptr = (char*)nptr; | ||
| 682 | return result; | ||
| 683 | } | ||
| 684 | |||
| 685 | #ifdef LIBSMCTRL_WRAPPER | ||
| 686 | // The CUDA runtime library uses dlopen() to load CUDA functions from | ||
| 687 | // libcuda.so.1. Since we replace that with our wrapper library, we need to | ||
| 688 | // also redirect any attempted opens of that shared object to the actual | ||
| 689 | // shared library, which is linked to by libcuda.so. | ||
| 690 | void *dlopen(const char *filename, int flags) { | ||
| 691 | if (filename && strcmp(filename, "libcuda.so") == 0) { | ||
| 692 | fprintf(stderr, "redirecting dlopen of %s to libcuda.so\n", filename); | ||
| 693 | // A GNU-only dlopen variant | ||
| 694 | return dlmopen(LM_ID_BASE, "libcuda.so", flags); | ||
| 695 | } else | ||
| 696 | return dlmopen(LM_ID_BASE, filename, flags); | ||
| 697 | } | ||
| 698 | |||
| 593 | // Allow setting a default mask via an environment variable | 699 | // Allow setting a default mask via an environment variable |
| 594 | // Also enables libsmctrl to be used on unmodified programs via setting: | 700 | // Also enables libsmctrl to be used on unmodified programs via setting: |
| 595 | // LD_PRELOAD=libsmctrl.so LIBSMCTRL_MASK=<your mask> ./my_program | 701 | // LD_LIBRARY_PATH=libsmctrl LIBSMCTRL_MASK=<your mask> ./my_program |
| 596 | // Where "<your mask>" is replaced with a disable mask, optionally prefixed | 702 | // Where "<your mask>" is replaced with a disable mask, optionally prefixed |
| 597 | // with a ~ to invert it (make it an enable mask). | 703 | // with a ~ to invert it (make it an enable mask). |
| 598 | __attribute__((constructor)) static void setup(void) { | 704 | __attribute__((constructor)) static void setup(void) { |
| 599 | char *end, *mask_str; | 705 | char *end, *mask_str; |
| 600 | // If dynamic changes are disabled (due to an error) this variable is | 706 | // If dynamic changes are disabled (due to an error) this variable is |
| 601 | // permanently used to store the supreme mask, rather than the SysV shared | 707 | // permanently used to store the supreme mask, rather than the shared |
| 602 | // memory segment. | 708 | // memory segment. |
| 603 | static uint64_t mask; | 709 | static uint128_t mask; |
| 604 | bool invert = false; | 710 | bool invert = false; |
| 605 | int shmid; | ||
| 606 | key_t shm_key; | ||
| 607 | 711 | ||
| 608 | mask_str = getenv("LIBSMCTRL_MASK"); | 712 | mask_str = getenv("LIBSMCTRL_MASK"); |
| 713 | |||
| 714 | // Assume no mask if unspecified | ||
| 609 | if (!mask_str) | 715 | if (!mask_str) |
| 610 | return; | 716 | mask_str = "0"; |
| 611 | 717 | ||
| 612 | if (*mask_str == '~') { | 718 | if (*mask_str == '~') { |
| 613 | invert = true; | 719 | invert = true; |
| 614 | mask_str++; | 720 | mask_str++; |
| 615 | } | 721 | } |
| 616 | 722 | ||
| 617 | // XXX: Doesn't support 128-bit masks | 723 | mask = strtou128(mask_str, &end, 16); |
| 618 | mask = strtoull(mask_str, &end, 0); | ||
| 619 | // Verify we were able to parse the whole string | 724 | // Verify we were able to parse the whole string |
| 620 | if (*end != '\0') | 725 | if (*end != '\0') |
| 621 | abort(1, EINVAL, "Unable to apply default mask"); | 726 | abort(1, EINVAL, "Unable to apply default mask"); |
| @@ -623,35 +728,64 @@ __attribute__((constructor)) static void setup(void) { | |||
| 623 | if (invert) | 728 | if (invert) |
| 624 | mask = ~mask; | 729 | mask = ~mask; |
| 625 | 730 | ||
| 731 | // Explictly set the number of channels (if unset), otherwise CUDA will only | ||
| 732 | // use two with MPS (see paper for why that causes problems) | ||
| 733 | if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 0) == -1) | ||
| 734 | abort(1, EINVAL, "Unable to configure environment"); | ||
| 735 | |||
| 736 | // Warn if a mask was specified but MPS isn't running | ||
| 737 | if (mask && !libsmctrl_is_mps_running()) | ||
| 738 | fprintf(stderr, "libsmctrl-libcuda-wrapper: Warning: TPC mask set via LIBSMCTRL_MASK, but NVIDIA MPS is not running. CUDA programs will not co-run!\n"); | ||
| 739 | |||
| 626 | // Initialize CUDA and the interception callback | 740 | // Initialize CUDA and the interception callback |
| 627 | setup_sm_control_callback(); | 741 | setup_sm_control_callback(); |
| 628 | 742 | ||
| 629 | // TODO: Switch to memfd_create(); this leaks IPC objects | 743 | // Create shared memory region for the supreme mask such that nvtaskset |
| 630 | // Create a SysV IPC key (32 bits) to identify our shared memory region | 744 | // can read and modify it |
| 631 | // Use the pid as the top 16 bits, and "sm" as the bottom 16 | 745 | int fd = memfd_create("libsmctrl", MFD_CLOEXEC); |
| 632 | shm_key = getpid(); | 746 | if (fd == -1) { |
| 633 | shm_key <<= 16; | ||
| 634 | shm_key |= (int)'s' << 8 | (int)'m'; | ||
| 635 | // Obtain or create a 128-bit (16-byte) shared memory region | ||
| 636 | shmid = shmget(shm_key, 16, IPC_CREAT | 0600); | ||
| 637 | if (shmid == -1) { | ||
| 638 | abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); | 747 | abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); |
| 639 | g_supreme_sm_mask = &mask; | 748 | g_supreme_sm_mask = &mask; |
| 640 | return; | 749 | return; |
| 641 | } | 750 | } |
| 642 | // Open the shared memory region | 751 | if (ftruncate(fd, 16) == -1) { |
| 643 | g_supreme_sm_mask = shmat(shmid, NULL, 0); | 752 | abort(0, errno, "Unable to resize shared memory for dynamic partition changes. Dynamic changes disabled"); |
| 644 | if (g_supreme_sm_mask == (void*)-1) { | 753 | g_supreme_sm_mask = &mask; |
| 645 | abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); | 754 | return; |
| 755 | } | ||
| 756 | if ((g_supreme_sm_mask = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) { | ||
| 757 | abort(0, errno, "Unable to map shared memory for dynamic partition changes. Dynamic changes disabled"); | ||
| 646 | g_supreme_sm_mask = &mask; | 758 | g_supreme_sm_mask = &mask; |
| 647 | return; | 759 | return; |
| 648 | } | 760 | } |
| 649 | // XXX: This makes the region unopenable to everyone else. Switch to memfd! | ||
| 650 | // Mark the shared memory region for deletion (after we terminate) | ||
| 651 | if (shmctl(shmid, IPC_RMID, NULL) == -1) | ||
| 652 | abort(0, errno, "Unable to mark shared memory for dynamic partition changes for deletion on process termination. Will leak one page of memory."); | ||
| 653 | 761 | ||
| 654 | // Set the super-global mask which cannot be overwritten by any libsmctrl | 762 | // Set the super-global mask which cannot be overwritten by any libsmctrl |
| 655 | // API function. | 763 | // API function. |
| 656 | *g_supreme_sm_mask = mask; | 764 | *g_supreme_sm_mask = mask; |
| 657 | } | 765 | } |
| 766 | #elif defined(LIBSMCTRL_STATIC) | ||
| 767 | // If this library is statically built into a program, and the libcuda.so.1 | ||
| 768 | // wrapper is enabled, we force the staticlly linked version of the library | ||
| 769 | // to defer to the function implementations in the wrapper. | ||
| 770 | // | ||
| 771 | // Longer explanation: | ||
| 772 | // If the library has been dynamically linked into a program and the wrapper | ||
| 773 | // is in use, the loader will point both to the same set of symbols (since both | ||
| 774 | // will do a dynamic lookup at load-time, the global state at the top of this | ||
| 775 | // file uses global linkage, and will thus be in the dynamic symbol table, and | ||
| 776 | // each lookup will find the same copy.) | ||
| 777 | // Symbols from a staticlly linked library are not included in the dynamic | ||
| 778 | // symbol table, and thus can exist in duplicate of those in any shared | ||
| 779 | // library. This is a problem, since only one callback function, using one set | ||
| 780 | // of global variables can be registered with CUDA. We work around this by | ||
| 781 | // having our statically linked library use the functions from the wrapper or | ||
| 782 | // any shared library, if one such instance is loaded. | ||
| 783 | __attribute__((constructor)) static void setup(void) { | ||
| 784 | // dlsym can only view the dynamic symbol tables, and so these lookups will | ||
| 785 | // fail if neither the wrapper (libcuda.so.1) nor libsmctrl.so are loaded. | ||
| 786 | // (That indicates that we should the static library implementations.) | ||
| 787 | // These are a NOP on failure since they return NULL when not found. | ||
| 788 | shared_set_next_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_next_mask"); | ||
| 789 | shared_set_global_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_global_mask"); | ||
| 790 | } | ||
| 791 | #endif | ||
diff --git a/libsmctrl_test_gpc_info.c b/libsmctrl_test_gpc_info.c index 558b80a..afa0876 100644 --- a/libsmctrl_test_gpc_info.c +++ b/libsmctrl_test_gpc_info.c | |||
| @@ -38,6 +38,8 @@ int main(int argc, char** argv) { | |||
| 38 | gpu_id = 0; | 38 | gpu_id = 0; |
| 39 | // Tell CUDA to use PCI device id ordering (to match nvdebug) | 39 | // Tell CUDA to use PCI device id ordering (to match nvdebug) |
| 40 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); | 40 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); |
| 41 | // Allow CUDA to see all devices (to better match nvdebug) | ||
| 42 | unsetenv("CUDA_VISIBLE_DEVICES"); | ||
| 41 | // A CUDA context is required before reading the topology information | 43 | // A CUDA context is required before reading the topology information |
| 42 | if ((res = cuInit(0))) { | 44 | if ((res = cuInit(0))) { |
| 43 | const char* name; | 45 | const char* name; |
| @@ -45,7 +47,7 @@ int main(int argc, char** argv) { | |||
| 45 | fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name); | 47 | fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name); |
| 46 | return 1; | 48 | return 1; |
| 47 | } | 49 | } |
| 48 | if ((res = cuCtxCreate(&ctx, 0, 0))) { | 50 | if ((res = cuCtxCreate(&ctx, 0, gpu_id))) { |
| 49 | const char* name; | 51 | const char* name; |
| 50 | cuGetErrorName(res, &name); | 52 | cuGetErrorName(res, &name); |
| 51 | fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name); | 53 | fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name); |
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu index 3b7ebcd..8d2bd79 100644 --- a/libsmctrl_test_mask_shared.cu +++ b/libsmctrl_test_mask_shared.cu | |||
| @@ -18,9 +18,8 @@ __global__ void read_and_store_smid(uint8_t* smid_arr) { | |||
| 18 | smid_arr[blockIdx.x] = smid; | 18 | smid_arr[blockIdx.x] = smid; |
| 19 | } | 19 | } |
| 20 | 20 | ||
| 21 | // Assuming SMs continue to support a maximum of 2048 resident threads, six | 21 | // Need at least as many blocks as there are SMs on NVIDIA's biggest GPUs |
| 22 | // blocks of 1024 threads should span at least three SMs without partitioning | 22 | #define NUM_BLOCKS 142 |
| 23 | #define NUM_BLOCKS 142 //6 | ||
| 24 | 23 | ||
| 25 | static int sort_asc(const void* a, const void* b) { | 24 | static int sort_asc(const void* a, const void* b) { |
| 26 | return *(uint8_t*)a - *(uint8_t*)b; | 25 | return *(uint8_t*)a - *(uint8_t*)b; |
| @@ -83,8 +82,11 @@ int test_constrained_size_and_location(enum partitioning_type part_type) { | |||
| 83 | // Apply partitioning to enable only the first TPC of each 32-bit block | 82 | // Apply partitioning to enable only the first TPC of each 32-bit block |
| 84 | switch (part_type) { | 83 | switch (part_type) { |
| 85 | case PARTITION_SUPREME: | 84 | case PARTITION_SUPREME: |
| 86 | printf("%s: Please set mask to '0x%016lx%016lx' for PID %d using the control deamon and press any key to continue...\n", program_invocation_name, (uint64_t)(mask >> 64), (uint64_t)mask, getpid()); | 85 | char cmd[80]; |
| 87 | fgetc(stdin); | 86 | // We must invert the mask before passing it to nvtaskset, since |
| 87 | // nvtaskset takes an enable mask (as with the taskset command) | ||
| 88 | snprintf(cmd, 80, "./nvtaskset -p 0x%.0lx%016lx %d > /dev/null", ~(uint64_t)(mask >> 64), ~(uint64_t)mask, getpid()); | ||
| 89 | system(cmd); | ||
| 88 | break; | 90 | break; |
| 89 | case PARTITION_GLOBAL: | 91 | case PARTITION_GLOBAL: |
| 90 | libsmctrl_set_global_mask(mask); | 92 | libsmctrl_set_global_mask(mask); |
| @@ -120,10 +122,9 @@ int test_constrained_size_and_location(enum partitioning_type part_type) { | |||
| 120 | uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too | 122 | uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too |
| 121 | if (uniq_partitioned > sms_per_tpc) { | 123 | if (uniq_partitioned > sms_per_tpc) { |
| 122 | printf("%s: ***Test failure.***\n" | 124 | printf("%s: ***Test failure.***\n" |
| 123 | "%s: Reason: With TPC mask set to " | 125 | "%s: Reason: With a partition of only one TPC, the test kernel " |
| 124 | "constrain all kernels to a single TPC, a kernel of %d blocks of " | 126 | "of %d blocks of 1024 threads ran on %d SMs (at most %d---one " |
| 125 | "1024 threads was launched and found to run on %d SMs (at most %d---" | 127 | "TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc); |
| 126 | "one TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc); | ||
| 127 | return 1; | 128 | return 1; |
| 128 | } | 129 | } |
| 129 | 130 | ||
| @@ -131,18 +132,16 @@ int test_constrained_size_and_location(enum partitioning_type part_type) { | |||
| 131 | if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 || | 132 | if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 || |
| 132 | smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) { | 133 | smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) { |
| 133 | printf("%s: ***Test failure.***\n" | 134 | printf("%s: ***Test failure.***\n" |
| 134 | "%s: Reason: With TPC mask set to " | 135 | "%s: Reason: With a partition of only TPC %d, the test kernel " |
| 135 | "constrain all kernels to TPC %d, a kernel was run and found " | 136 | "ran on SM IDs as high as %d and as low as %d (range of %d to %d " |
| 136 | "to run on an SM IDs: as high as %d and as low as %d (range of %d to %d expected).\n", | 137 | "expected).\n", program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc); |
| 137 | program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc); | ||
| 138 | return 1; | 138 | return 1; |
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | // Div by 32 via a shift | 141 | // Div by 32 via a shift |
| 142 | asprintf(&reason[enabled_tpc >> 5], | 142 | asprintf(&reason[enabled_tpc >> 5], |
| 143 | "With a partition enabled which " | 143 | "With a partition of only TPC %d, the test kernel used only %d " |
| 144 | "contained only TPC ID %d, the test kernel was found to use only %d " | 144 | "SMs (%d without), and all had IDs between %d and %d (were contained" |
| 145 | "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained" | ||
| 146 | " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc); | 145 | " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc); |
| 147 | } | 146 | } |
| 148 | 147 | ||
diff --git a/nvtaskset.c b/nvtaskset.c index 74f88d7..4901cbe 100644 --- a/nvtaskset.c +++ b/nvtaskset.c | |||
| @@ -1,210 +1,384 @@ | |||
| 1 | // Copyright 2025 Joshua Bakita | 1 | // Copyright 2025 Joshua Bakita |
| 2 | // taskset-like utility for the GPU | 2 | // Show or change the GPU core affinity for a CUDA process |
| 3 | // taskset-like utility for NVIDIA GPUs | ||
| 3 | #define _GNU_SOURCE // For program_invocation_name | 4 | #define _GNU_SOURCE // For program_invocation_name |
| 4 | #include <argp.h> | 5 | #include <argp.h> |
| 6 | #include <dirent.h> | ||
| 5 | #include <errno.h> | 7 | #include <errno.h> |
| 6 | #include <error.h> | 8 | #include <error.h> |
| 9 | #include <fcntl.h> | ||
| 10 | #include <stdbool.h> | ||
| 7 | #include <stdint.h> | 11 | #include <stdint.h> |
| 8 | #include <stdio.h> | 12 | #include <stdio.h> |
| 9 | #include <stdlib.h> | 13 | #include <stdlib.h> |
| 10 | #include <string.h> | 14 | #include <string.h> |
| 11 | #include <sys/ipc.h> | 15 | #include <sys/mman.h> |
| 12 | #include <sys/shm.h> | ||
| 13 | #include <sys/types.h> | ||
| 14 | #include <unistd.h> | 16 | #include <unistd.h> |
| 15 | 17 | ||
| 16 | #include <cuda.h> // To help with getting GPC info | 18 | #include <cuda.h> // To help with getting GPC info |
| 17 | 19 | ||
| 18 | #include "libsmctrl.h" | 20 | #include "libsmctrl.h" |
| 19 | 21 | ||
| 20 | const char* maintainer = "<jbakita@cs.unc.edu>"; | 22 | #define LINK_NAME "/memfd:libsmctrl" |
| 21 | const char* version = "nvtaskset 2025.03"; | 23 | |
| 22 | const char* desc = "taskset-like utility for NVIDIA GPUs."; | 24 | // TODO: Write automated tests: |
| 25 | // - Change region of non-existent PID | ||
| 26 | // - Change region of permission denied PID | ||
| 27 | // - Change region of non-GPU PID | ||
| 28 | // - Change GPC list | ||
| 29 | // - Change TPC list | ||
| 30 | // - Change TPC mask | ||
| 31 | // - Start TPC mask | ||
| 32 | // - Start TPC list | ||
| 33 | // - Start GPC list | ||
| 34 | // - Start with subargument containing - | ||
| 35 | // - Query GPC list | ||
| 36 | // - Query TPC list | ||
| 37 | // - Query TPC mask | ||
| 38 | // - Set GPC list w/ non-existant GPC | ||
| 39 | // - Set TPC list w/ non-existant TPC | ||
| 40 | |||
| 41 | // Private symbols from libsmctrl | ||
| 42 | extern bool libsmctrl_is_mps_running(); | ||
| 43 | extern uint128_t strtou128(const char *nptr, char **endptr, int base); | ||
| 44 | |||
| 45 | const char *argp_program_bug_address = "<jbakita@cs.unc.edu>"; | ||
| 46 | const char *argp_program_version = "nvtaskset 2025.06"; | ||
| 47 | const char desc[] = "Show or change the GPU core affinity for a CUDA process\v" | ||
| 48 | "Warning: When using GPC lists, this tool currently " | ||
| 49 | "derives TPC to GPC mappings from the first NVIDIA GPU in " | ||
| 50 | "the system (by PCI bus ID) device. To use the mappings " | ||
| 51 | "for a different device, use the `libsmctrl_test_get_info` " | ||
| 52 | "tool to get the bitmask of TPCs associated with each GPC, " | ||
| 53 | "OR them, and then set that bitmask via this tool. Better " | ||
| 54 | "multi-GPU support is intended for a future release.\n\n" | ||
| 55 | "Inspired by the Linux taskset utility."; | ||
| 56 | const char args_doc[] = "[mask | list] [pid | cmd [args...]]"; | ||
| 23 | 57 | ||
| 24 | const struct argp_option opts[] = { | 58 | const struct argp_option opts[] = { |
| 59 | {"gpc-list", 'g', NULL, 0, "Specify partition as a list of GPCs"}, | ||
| 60 | {"tpc-list", 't', NULL, 0, "Specify partition as a list of TPCs"}, | ||
| 61 | {"pid", 'p', NULL, 0, "Operate on an existing PID"}, | ||
| 25 | {0} | 62 | {0} |
| 26 | }; | 63 | }; |
| 27 | 64 | ||
| 28 | unsigned __int128 strtou128(const char *nptr, char **endptr, int base) { | 65 | // Create a CUDA context and query the associated GPC to TPC mappings |
| 29 | unsigned __int128 result = 0; | 66 | // Based off logic in libsmctrl_test_gpc_info |
| 30 | if (base != 16) | ||
| 31 | error(1, EINVAL, "Internal error"); | ||
| 32 | // Skip a "0x" prefix. Safe due to early evaluation | ||
| 33 | if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X')) | ||
| 34 | nptr += 2; | ||
| 35 | // Until hitting an invalid character | ||
| 36 | while (1) { | ||
| 37 | if (*nptr >= 'a' && *nptr <= 'f') | ||
| 38 | result = result << 4 | (*nptr - 'a' + 10); | ||
| 39 | else if (*nptr >= 'A' && *nptr <= 'F') | ||
| 40 | result = result << 4 | (*nptr - 'A' + 10); | ||
| 41 | else if (*nptr >= '0' && *nptr <= '9') | ||
| 42 | result = result << 4 | (*nptr - '0'); | ||
| 43 | else | ||
| 44 | break; | ||
| 45 | nptr++; | ||
| 46 | } | ||
| 47 | if (endptr) | ||
| 48 | *endptr = (char*)nptr; | ||
| 49 | return result; | ||
| 50 | } | ||
| 51 | |||
| 52 | void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int gpu_id) { | 67 | void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int gpu_id) { |
| 53 | int res; | 68 | int res; |
| 54 | CUcontext ctx; | 69 | CUcontext ctx; |
| 55 | // XXX: Copied from libsmctrl_test_gpc_info | 70 | char *old_order = NULL; |
| 56 | // Tell CUDA to use PCI device id ordering (to match nvdebug) | 71 | // Tell CUDA to use PCI device id ordering (to match nvdebug) |
| 57 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); | 72 | putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID"); |
| 58 | // A CUDA context is required before reading the topology information | 73 | // Allow CUDA to see all devices (to better match nvdebug) |
| 59 | if ((res = cuInit(0))) { | 74 | if (getenv("CUDA_VISIBLE_DEVICES")) { |
| 60 | const char* name; | 75 | if (!(old_order = strdup(getenv("CUDA_VISIBLE_DEVICES")))) |
| 61 | cuGetErrorName(res, &name); | 76 | error(1, errno, "Unable to allocate environment string"); |
| 62 | fprintf(stderr, "%s: Unable to initialize CUDA, error %s\n", program_invocation_name, name); | 77 | unsetenv("CUDA_VISIBLE_DEVICES"); |
| 63 | exit(1); | 78 | } |
| 64 | } | 79 | // A CUDA context is required before reading the topology information |
| 65 | if ((res = cuCtxCreate(&ctx, 0, 0))) { | 80 | if ((res = cuInit(0))) { |
| 66 | const char* name; | 81 | const char* name; |
| 67 | cuGetErrorName(res, &name); | 82 | cuGetErrorName(res, &name); |
| 68 | fprintf(stderr, "%s: Unable to create a CUDA context, error %s\n", program_invocation_name, name); | 83 | error(1, 0, "Unable to create a initialize CUDA, error %s\n", name); |
| 69 | exit(1); | 84 | } |
| 70 | } | 85 | if ((res = cuCtxCreate(&ctx, 0, gpu_id))) { |
| 71 | // Pull topology information from libsmctrl | 86 | const char* name; |
| 72 | if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) { | 87 | cuGetErrorName(res, &name); |
| 73 | error(0, res, "libsmctrl_get_gpc_info() failed"); | 88 | error(1, 0, "Unable to create a CUDA context, error %s\n", name); |
| 74 | if (res == ENOENT) | 89 | } |
| 75 | fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name); | 90 | // Pull topology information from libsmctrl |
| 76 | if (res == EIO) | 91 | if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) { |
| 77 | fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name); | 92 | error(0, res, "libsmctrl_get_gpc_info() failed"); |
| 78 | exit(1); | 93 | if (res == ENOENT) |
| 79 | } | 94 | fprintf(stderr, "%s: Is the nvdebug kernel module loaded?\n", program_invocation_name); |
| 80 | // Not copied | 95 | if (res == EIO) |
| 96 | fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name); | ||
| 97 | exit(1); | ||
| 98 | } | ||
| 99 | // Restore the environment (in case we exec() later) | ||
| 81 | unsetenv("CUDA_DEVICE_ORDER"); | 100 | unsetenv("CUDA_DEVICE_ORDER"); |
| 101 | if (old_order) { | ||
| 102 | setenv("CUDA_VISIBLE_DEVICES", old_order, 1); | ||
| 103 | free(old_order); | ||
| 104 | } | ||
| 82 | } | 105 | } |
| 83 | 106 | ||
| 84 | int main(int argc, char **argv) { | 107 | int parse_list(bool use_gpcs, char* list, uint128_t *mask_out) { |
| 85 | if (argc < 3) { | 108 | // We support the same ranges as taskset, e.g., X,Y,Z and X,Y-Z |
| 86 | fprintf(stderr, "Usage: %s -p <hex mask> <pid>\n", argv[0]); | 109 | uint32_t num_xpcs = 0; // Either TPC or GPC count, i.e., "X"PC |
| 87 | fprintf(stderr, " %s <hex mask> <command> <argument...>\n", argv[0]); | 110 | uint128_t* masks = NULL; |
| 88 | fprintf(stderr, " %s --gpc-list <gpc list> <command> <argument...>\n", argv[0]); | 111 | // TODO: Allow specifying GPU ID, rather than assuming 0! |
| 89 | fprintf(stderr, " <hex mask> has a bit set for each TPC to be enabled\n"); | 112 | if (use_gpcs) |
| 90 | return 1; | 113 | libsmctrl_get_gpc_info_ext_easy(&num_xpcs, &masks, 0); |
| 91 | } | 114 | else |
| 92 | // TODO: Use a proper argument parser | 115 | libsmctrl_get_tpc_info_cuda(&num_xpcs, 0); |
| 93 | if (strcmp("-p", argv[1]) == 0) { // Setting mask on running task | 116 | uint128_t mask = 0; |
| 94 | char *end; | 117 | int range_start_xpc = -1; |
| 95 | pid_t target_pid = strtoul(argv[2], &end, 10); | 118 | char* start = list; |
| 96 | // strtoul stores a pointer to the first invalid character in `end` | 119 | int len = strlen(list); |
| 97 | if (*end != '\0') { | 120 | // Convert comma-seperated GPC/TPC list into a mask |
| 98 | fprintf(stderr, "Invalid character \"%c\" in PID argument.\n", *end); | 121 | for (int i = 0; i < len + 1; i++) { |
| 99 | return 1; | 122 | if (list[i] == ',' || list[i] == '\0') { |
| 123 | list[i] = '\0'; | ||
| 124 | int xpc = atoi(start); | ||
| 125 | if (xpc > num_xpcs - 1) | ||
| 126 | error(1, EINVAL, "%s is not a valid %s ID", start, use_gpcs ? "GPC" : "TPC"); | ||
| 127 | // Handle ranges | ||
| 128 | if (range_start_xpc != -1) { | ||
| 129 | if (range_start_xpc >= xpc) | ||
| 130 | error(1, EINVAL, "Malformed %s range", use_gpcs ? "GPC" : "TPC"); | ||
| 131 | while (range_start_xpc <= xpc) { | ||
| 132 | if (use_gpcs) | ||
| 133 | mask |= masks[range_start_xpc]; | ||
| 134 | else | ||
| 135 | mask |= (uint128_t)1 << range_start_xpc; | ||
| 136 | range_start_xpc++; | ||
| 137 | } | ||
| 138 | range_start_xpc = -1; | ||
| 139 | } else { | ||
| 140 | if (use_gpcs) | ||
| 141 | mask |= masks[xpc]; | ||
| 142 | else | ||
| 143 | mask |= (uint128_t)1 << xpc; | ||
| 144 | } | ||
| 145 | start = list + i + 1; | ||
| 100 | } | 146 | } |
| 101 | unsigned __int128 mask = strtou128(argv[3], &end, 16); | 147 | // Range start |
| 102 | if (*end != '\0') { | 148 | if (list[i] == '-') { |
| 103 | fprintf(stderr, "Invalid character \"%c\" in mask argument.\n", *end); | 149 | list[i] = '\0'; |
| 104 | return 1; | 150 | range_start_xpc = atoi(start); |
| 151 | start = list + i + 1; | ||
| 105 | } | 152 | } |
| 106 | // The shared memory lookup key is the lower 16-bits of the PID | "sm" | 153 | } |
| 107 | key_t shm_key = target_pid << 16 | (int)'s' << 8 | (int) 'm'; | 154 | *mask_out = mask; |
| 108 | // Get a handle to the 128-bit shared memory region | 155 | return 0; |
| 109 | int shmid = shmget(shm_key, 16, 0); | 156 | } |
| 110 | if (shmid == -1) | 157 | |
| 111 | error(1, errno, "Unable to find control region for PID %d", target_pid); | 158 | // Always returns a valid string |
| 112 | // Open the shared memory region | 159 | char* compose_list(uint128_t mask) { |
| 113 | unsigned __int128 *supreme_mask = shmat(shmid, NULL, 0); | 160 | // List will always be shorter than every TPC, comma-seperated |
| 114 | if (supreme_mask == (void*)-1) | 161 | // 128 TPCs, with 10 1-char, 90 2-char, 28 3-char, 127 commas, and 1 null |
| 115 | error(1, errno, "Unable to open control region for PID %d", target_pid); | 162 | static char list[10 + 90*2 + 28*3 + 128]; |
| 116 | // Write the requested mask into the shared memory region | 163 | char* tail = list; |
| 117 | *supreme_mask = mask; | 164 | int last_enabled = -2; |
| 118 | } else { // Starting a new task with a mask | 165 | bool in_range; |
| 119 | // TODO: Check other locations for nvidia-cuda-mps-control if its not on the path | 166 | for (int i = 0; i < 128; i++) { |
| 120 | // TODO: Use dup2() to redirect MPS startup messages | 167 | bool enabled = (mask >> i) & 1; |
| 121 | int ret = system("echo -n | nvidia-cuda-mps-control"); | 168 | if (in_range) { |
| 122 | if (ret == -1) | 169 | if (enabled) { |
| 123 | error(1, errno, "Unable to run subshell to check MPS status"); | 170 | last_enabled = i; |
| 124 | if (ret != 0) { // Control deamon not yet started | 171 | } else { |
| 125 | fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n"); | 172 | tail += sprintf(tail, "%d,", last_enabled); |
| 126 | ret = system("nvidia-cuda-mps-control -d"); | 173 | in_range = false; |
| 127 | if (ret == -1) | ||
| 128 | error(1, errno, "Unable to run subshell to start MPS"); | ||
| 129 | if (ret == 1) { | ||
| 130 | fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n"); | ||
| 131 | return 1; | ||
| 132 | } | 174 | } |
| 133 | fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n"); | 175 | continue; |
| 176 | } | ||
| 177 | if (enabled) { | ||
| 178 | if (last_enabled == i - 1) { | ||
| 179 | in_range = true; | ||
| 180 | tail += sprintf(tail, "-"); | ||
| 181 | } else { | ||
| 182 | tail += sprintf(tail, "%d", i); | ||
| 183 | } | ||
| 184 | last_enabled = i; | ||
| 185 | } else { | ||
| 186 | if (last_enabled == i - 1) { | ||
| 187 | tail += sprintf(tail, ","); | ||
| 188 | } | ||
| 189 | } | ||
| 190 | } | ||
| 191 | // Strip trailing comma | ||
| 192 | if (*(tail - 1) == ',') | ||
| 193 | *(tail - 1) = '\0'; | ||
| 194 | return list; | ||
| 195 | } | ||
| 196 | |||
| 197 | // Always returns a valid string | ||
| 198 | // (Terminates the program on error) | ||
| 199 | char* compose_gpc_list(uint128_t mask) { | ||
| 200 | uint32_t num_gpcs = 0; | ||
| 201 | uint128_t* masks = NULL; | ||
| 202 | libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0); | ||
| 203 | uint128_t gpc_mask = 0; | ||
| 204 | // Try to find correspondence between a list of TPCs and GPCs | ||
| 205 | for (int gpc = 0; gpc < num_gpcs; gpc++) { | ||
| 206 | if ((masks[gpc] & mask) == masks[gpc]) { | ||
| 207 | gpc_mask |= 1 << gpc; | ||
| 208 | mask &= ~masks[gpc]; | ||
| 134 | } | 209 | } |
| 135 | // Tell loader to initialize libsmctrl.so first | 210 | } |
| 136 | // TODO: Append, rather than overwrite LD_PRELOAD | 211 | if (mask) |
| 137 | setenv("LD_PRELOAD", "libsmctrl.so", 1); | 212 | error(1, EINVAL, "Unable to interpret affinity as GPC list; try -t instead of -g"); |
| 138 | // Explictly set the number of channels, otherwise CUDA will only use two | 213 | return compose_list(gpc_mask); |
| 139 | // (see paper for why that causes problems) | 214 | } |
| 140 | setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 1); | 215 | |
| 141 | // Check if a mask, or a list of GPCs is being provided | 216 | |
| 142 | if (strcmp(argv[1], "--gpc-list") == 0) { | 217 | uint128_t* get_mask_hndl(pid_t target_pid) { |
| 143 | // TODO: Support the full syntax that taskset supports | 218 | char fd_path[277]; |
| 144 | // We just support X,Y,Z for now | 219 | int fd; |
| 145 | uint32_t num_gpcs = 0; | 220 | uint128_t *mask_hndl; |
| 146 | uint128_t* masks = NULL; | 221 | DIR *dp; |
| 147 | // TODO: Allow specifying GPU ID, rather than assuming 0! | 222 | struct dirent *entry; |
| 148 | libsmctrl_get_gpc_info_ext_easy(&num_gpcs, &masks, 0); | 223 | // Search for the file descriptor which represents the libsmctrl control |
| 149 | uint128_t mask = 0; | 224 | // region. |
| 150 | int range_start_gpc = -1; | 225 | snprintf(fd_path, 277, "/proc/%d/fd/", target_pid); |
| 151 | char* start = argv[2]; | 226 | if (!(dp = opendir(fd_path))) { |
| 152 | int len = strlen(argv[2]); | 227 | if (errno == ENOENT) |
| 153 | // TODO: Handle invalid input cleanly. | 228 | error(1, 0, "Unable to find PID %d.", target_pid); |
| 154 | // Convert comma-seperated GPC list into a mask | 229 | else |
| 155 | for (int i = 0; i < len + 1; i++) { | 230 | error(1, errno, "Unable to access PID %d", target_pid); |
| 156 | if (argv[2][i] == ',' || argv[2][i] == '\0') { | 231 | } |
| 157 | argv[2][i] = '\0'; | 232 | while (entry = readdir(dp)) { |
| 158 | int gpc = atoi(start); | 233 | char link[sizeof(LINK_NAME)]; |
| 159 | if (gpc > num_gpcs - 1) { | 234 | snprintf(fd_path, 277, "/proc/%d/fd/%s", target_pid, entry->d_name); |
| 160 | fprintf(stderr, "Invalid GPC ID '%s'!\n", start); | 235 | readlink(fd_path, link, sizeof(LINK_NAME)); |
| 236 | if (strncmp(LINK_NAME, link, sizeof(LINK_NAME) - 1) == 0) | ||
| 237 | break; | ||
| 238 | } | ||
| 239 | closedir(dp); | ||
| 240 | if (!entry) | ||
| 241 | error(1, 0, "Unable to find libsmctrl-wrapper control region for PID %d.", target_pid); | ||
| 242 | // Access the shared memory region for libsmctrl control. | ||
| 243 | if ((fd = open(fd_path, O_RDWR)) == -1) | ||
| 244 | error(1, errno, "Unable to open libsmctrl-wrapper control file %s", fd_path); | ||
| 245 | mask_hndl = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||
| 246 | if (mask_hndl == MAP_FAILED) | ||
| 247 | error(1, errno, "Unable to memory-map libsmctrl-wrapper control file %s", fd_path); | ||
| 248 | close(fd); | ||
| 249 | return mask_hndl; | ||
| 250 | } | ||
| 251 | |||
| 252 | static error_t arg_parser(int key, char* arg, struct argp_state *state){ | ||
| 253 | static bool is_cmd = true; | ||
| 254 | static bool is_query = false; | ||
| 255 | static bool is_list = false; | ||
| 256 | static bool use_gpcs = false; | ||
| 257 | static uint128_t mask = 0; | ||
| 258 | static pid_t target_pid = 0; | ||
| 259 | static char **sub_argv = NULL; | ||
| 260 | char *end; | ||
| 261 | // Handle what to do in case of each option | ||
| 262 | switch (key) { | ||
| 263 | case 'g': | ||
| 264 | if (is_list) | ||
| 265 | argp_error(state, "Only one of -g and -t may be specified.\n"); | ||
| 266 | use_gpcs = true; | ||
| 267 | is_list = true; | ||
| 268 | break; | ||
| 269 | case 't': | ||
| 270 | if (is_list) | ||
| 271 | argp_error(state, "Only one of -g and -t may be specified.\n"); | ||
| 272 | is_list = true; | ||
| 273 | break; | ||
| 274 | case 'p': | ||
| 275 | is_cmd = false; | ||
| 276 | break; | ||
| 277 | case ARGP_KEY_ARG: | ||
| 278 | // Options: | ||
| 279 | // 1. -p and one argument -> Query mask for PID | ||
| 280 | // 2. -p and two arguments -> Set mask for PID | ||
| 281 | // 3. No -p and at least one argument -> Set mask and launch command | ||
| 282 | // (otherwise: invalid) | ||
| 283 | if (state->arg_num == 0 && !is_cmd && state->argc - state->next == 0) | ||
| 284 | is_query = true; | ||
| 285 | // Handle invalid and valid query cases | ||
| 286 | if (is_query) { | ||
| 287 | if (state->arg_num == 0) { | ||
| 288 | target_pid = strtoul(arg, &end, 10); | ||
| 289 | if (*end != '\0') | ||
| 290 | argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end); | ||
| 291 | break; | ||
| 292 | } else | ||
| 293 | return ARGP_ERR_UNKNOWN; | ||
| 294 | } | ||
| 295 | // Handle non-query cases | ||
| 296 | if (state->arg_num == 0 && state->argc - state->next != 0) { | ||
| 297 | if (is_list) { | ||
| 298 | parse_list(use_gpcs, arg, &mask); | ||
| 299 | } else { | ||
| 300 | // strtoul stores a pointer to the first invalid character in `end` | ||
| 301 | mask = strtou128(arg, &end, 16); | ||
| 302 | if (*end != '\0') | ||
| 303 | argp_error(state, "Invalid character \"%c\" in mask argument.\n", *end); | ||
| 304 | } | ||
| 305 | } else if (state->arg_num == 1 && !is_cmd) { | ||
| 306 | target_pid = strtoul(arg, &end, 10); | ||
| 307 | if (*end != '\0') | ||
| 308 | argp_error(state, "Invalid character \"%c\" in PID argument.\n", *end); | ||
| 309 | } else | ||
| 310 | return ARGP_ERR_UNKNOWN; | ||
| 311 | break; | ||
| 312 | case ARGP_KEY_ARGS: | ||
| 313 | if (!is_cmd) | ||
| 314 | return ARGP_ERR_UNKNOWN; | ||
| 315 | sub_argv = state->argv + state->next; | ||
| 316 | break; | ||
| 317 | case ARGP_KEY_END: | ||
| 318 | if (is_query && state->arg_num < 1) | ||
| 319 | argp_usage(state); | ||
| 320 | else if (!is_query && state->arg_num < 2) | ||
| 321 | argp_usage(state); | ||
| 322 | break; | ||
| 323 | case ARGP_KEY_FINI: | ||
| 324 | if (is_query) { | ||
| 325 | // query PID | ||
| 326 | uint128_t* mask_hndl = get_mask_hndl(target_pid); | ||
| 327 | uint128_t enable_mask = ~*mask_hndl; | ||
| 328 | if (use_gpcs & is_list) | ||
| 329 | printf("PID %d's current GPC affinity list: %s\n", target_pid, compose_gpc_list(enable_mask)); | ||
| 330 | else if (use_gpcs & !is_list) | ||
| 331 | argp_error(state, "Unsupported to print query as a GPC mask.\n"); | ||
| 332 | else if (is_list) | ||
| 333 | printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(enable_mask)); | ||
| 334 | else | ||
| 335 | printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(enable_mask >> 64), (uint64_t)enable_mask); | ||
| 336 | } else if (is_cmd) { | ||
| 337 | // start MPS (as needed) | ||
| 338 | if (!libsmctrl_is_mps_running()) { | ||
| 339 | fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n"); | ||
| 340 | int ret = system("nvidia-cuda-mps-control -d"); | ||
| 341 | if (ret == -1) | ||
| 342 | error(1, errno, "Unable to run subshell to start MPS"); | ||
| 343 | if (ret == 1) { | ||
| 344 | fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n"); | ||
| 161 | return 1; | 345 | return 1; |
| 162 | } | 346 | } |
| 163 | // Handle ranges | 347 | fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n"); |
| 164 | if (range_start_gpc != -1) { | ||
| 165 | if (range_start_gpc >= gpc) { | ||
| 166 | fprintf(stderr, "Invalid GPC range!\n"); | ||
| 167 | return 1; | ||
| 168 | } | ||
| 169 | while (range_start_gpc <= gpc) { | ||
| 170 | //printf("gpc %i\n", range_start_gpc); | ||
| 171 | mask |= masks[range_start_gpc]; | ||
| 172 | range_start_gpc++; | ||
| 173 | } | ||
| 174 | range_start_gpc = -1; | ||
| 175 | } else { | ||
| 176 | //printf("gpc %i\n", gpc); | ||
| 177 | mask |= masks[gpc]; | ||
| 178 | } | ||
| 179 | start = argv[2] + i + 1; | ||
| 180 | } | 348 | } |
| 181 | // Range start | 349 | // launch subprocess |
| 182 | if (argv[2][i] == '-') { | 350 | // Convert to string, prefix with ~, and set env var |
| 183 | argv[2][i] = '\0'; | 351 | char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0' |
| 184 | range_start_gpc = atoi(start); | 352 | snprintf(mask_str, 36, "~0x%.0lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask); |
| 185 | start = argv[2] + i + 1; | 353 | setenv("LIBSMCTRL_MASK", mask_str, 1); |
| 354 | // Start task | ||
| 355 | execvp(sub_argv[0], sub_argv); | ||
| 356 | error(1, errno, "Unable to launch task '%s'", sub_argv[0]); | ||
| 357 | } else { | ||
| 358 | if (!libsmctrl_is_mps_running()) | ||
| 359 | printf("Warning: NVIDIA MPS is not running. CUDA programs will not co-run! Run nvidia-cuda-mps-control -d before launching any CUDA-using programs that should co-run.\n"); | ||
| 360 | // change mask on PID | ||
| 361 | uint128_t* mask_hndl = get_mask_hndl(target_pid); | ||
| 362 | if (!is_list) { | ||
| 363 | printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, ~(uint64_t)(*mask_hndl >> 64), ~(uint64_t)*mask_hndl); | ||
| 364 | printf("PID %d's new TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(mask >> 64), (uint64_t)mask); | ||
| 365 | } else { | ||
| 366 | printf("PID %d's current TPC affinity list: %s\n", target_pid, compose_list(~*mask_hndl)); | ||
| 367 | printf("PID %d's new TPC affinity list: %s\n", target_pid, compose_list(mask)); | ||
| 186 | } | 368 | } |
| 369 | // Write the requested mask into the shared memory region | ||
| 370 | *mask_hndl = ~mask; | ||
| 187 | } | 371 | } |
| 188 | // Convert to string, prefix with ~, and set env var | 372 | break; |
| 189 | char mask_str[32+3+1]; // 32 hexits, "~0x", and '\0' | 373 | default: |
| 190 | snprintf(mask_str, 36, "~0x%lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask); | 374 | return ARGP_ERR_UNKNOWN; |
| 191 | //printf("nvtaskset: Using mask string %s\n", mask_str); | ||
| 192 | setenv("LIBSMCTRL_MASK", mask_str, 1); | ||
| 193 | // Start task | ||
| 194 | execvp(argv[3], argv+3); | ||
| 195 | error(1, errno, "Unable to launch task '%s'", argv[3]); | ||
| 196 | } else { | ||
| 197 | // Tell libsmctrl what mask to use | ||
| 198 | char* mask = malloc(strlen(argv[1]) + 2); | ||
| 199 | mask[0] = '~'; // Make an enable mask | ||
| 200 | strcpy(mask+1, argv[1]); | ||
| 201 | setenv("LIBSMCTRL_MASK", mask, 1); | ||
| 202 | free(mask); // setenv() made a copy | ||
| 203 | // Start task | ||
| 204 | execvp(argv[2], argv+2); | ||
| 205 | error(1, errno, "Unable to launch task '%s'", argv[2]); | ||
| 206 | } | ||
| 207 | } | 375 | } |
| 208 | fprintf(stderr, "Invalid arguments\n"); | 376 | return 0; |
| 209 | return 1; | 377 | } |
| 378 | |||
| 379 | struct argp argp = {opts, arg_parser, args_doc, desc}; | ||
| 380 | |||
| 381 | int main(int argc, char **argv) { | ||
| 382 | argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, NULL); | ||
| 383 | return 0; | ||
| 210 | } | 384 | } |
