diff options
| author | Joshua Bakita <jbakita@cs.unc.edu> | 2025-07-03 13:41:36 -0400 |
|---|---|---|
| committer | Joshua Bakita <jbakita@cs.unc.edu> | 2025-07-03 13:49:18 -0400 |
| commit | 85c0b374fefa5bc8a5c3036ab79ba3df78fa71d2 (patch) | |
| tree | 0f0958b307d1d354ad02832a6244aa3fd3882c32 | |
| parent | 9b4b4f71fd843c3ec97ca6f55935675e62ca31f5 (diff) | |
Support installation on Linux4Tegra (ecrts25-ae)
- Use alternate path if nvidia-cuda-mps-control is not found.
- Work-around missing CUDA_MPS_PIPE_DIRECTORY default in L4T.
- Install to L4T-specific paths if they exist.
Note that CUDA_MPS_PIPE_DIRECTORY must be manually set by the user
on L4T systems before they can call nvidia-cuda-mps-control, or to
have non-nvtaskset-launched applications run with MPS.
Tested on Jetson AGX Orin with L4T 36.3.0 and cuda-compat-12-6.
| -rw-r--r-- | Makefile | 39 | ||||
| -rw-r--r-- | nvtaskset.c | 27 |
2 files changed, 49 insertions, 17 deletions
| @@ -71,22 +71,35 @@ clean: | |||
| 71 | libsmctrl_test_next_mask libsmctrl_test_next_mask_override \ | 71 | libsmctrl_test_next_mask libsmctrl_test_next_mask_override \ |
| 72 | nvtaskset libcuda.so.1 | 72 | nvtaskset libcuda.so.1 |
| 73 | 73 | ||
| 74 | # On L4T (Linux4Tegra), the paths are different, and there may be multiple copies of libcuda.so.1 | ||
| 74 | install: libcuda.so.1 | 75 | install: libcuda.so.1 |
| 75 | @# Check that CUDA is installed first | 76 | @set -e -x; \ |
| 76 | test -f /usr/lib/$(ARCH)/libcuda.so.*.* | 77 | for DIR in /usr/lib/$(ARCH) /usr/local/cuda-*.*/compat /usr/lib/$(ARCH)/nvidia; do \ |
| 77 | @# Change libcuda.so link to bypass libcuda.so.1 | 78 | if [ ! -d $$DIR ]; then continue; fi; \ |
| 78 | sudo ln -sf /usr/lib/$(ARCH)/libcuda.so.*.* /usr/lib/$(ARCH)/libcuda.so | 79 | # Check that CUDA is installed in this location \ |
| 79 | @# Remove libcuda.so.1 symlink | 80 | if [ ! -f $$DIR/libcuda.so.*.* ]; then continue; fi; \ |
| 80 | sudo rm /usr/lib/$(ARCH)/libcuda.so.1 | 81 | # Change libcuda.so link to bypass libcuda.so.1 \ |
| 81 | @# Install wrapper as libcuda.so.1 | 82 | sudo ln -sf $$DIR/libcuda.so.*.* $$DIR/libcuda.so; \ |
| 82 | sudo cp libcuda.so.1 /usr/lib/$(ARCH)/libcuda.so.1 | 83 | # Remove libcuda.so.1 symlink \ |
| 84 | sudo rm $$DIR/libcuda.so.1; \ | ||
| 85 | # Install wrapper as libcuda.so.1 \ | ||
| 86 | sudo cp libcuda.so.1 $$DIR/libcuda.so.1; \ | ||
| 87 | done \ | ||
| 88 | # Special handling for L4T \ | ||
| 89 | if [ -d /usr/lib/$(ARCH)/nvidia ]; then sudo ln -sf nvidia/libcuda.so.1 /usr/lib/$(ARCH)/libcuda.so.1; fi | ||
| 83 | 90 | ||
| 84 | remove: | 91 | remove: |
| 85 | @# Test that our library in installed first | 92 | @set -e -x; \ |
| 86 | test ! -L /usr/lib/$(ARCH)/libcuda.so.1 | 93 | for DIR in /usr/lib/$(ARCH) /usr/local/cuda-*.*/compat /usr/lib/$(ARCH)/nvidia; do \ |
| 87 | @# Overwrite install with original symlinks | 94 | if [ ! -d $$DIR ]; then continue; fi; \ |
| 88 | sudo ln -sf libcuda.so.1 /usr/lib/$(ARCH)/libcuda.so | 95 | # Check that CUDA is installed in this location \ |
| 89 | sudo ln -sf /usr/lib/$(ARCH)/libcuda.so.*.* /usr/lib/$(ARCH)/libcuda.so.1 | 96 | if [ ! -f $$DIR/libcuda.so.*.* ]; then continue; fi; \ |
| 97 | # Test that our library in installed here \ | ||
| 98 | if [ -L $$DIR/libcuda.so.1 ]; then continue; fi; \ | ||
| 99 | # Overwrite install with original symlinks \ | ||
| 100 | sudo ln -sf libcuda.so.1 $$DIR/libcuda.so; \ | ||
| 101 | sudo ln -sf $$DIR/libcuda.so.*.* $$DIR/libcuda.so.1; \ | ||
| 102 | done | ||
| 90 | 103 | ||
| 91 | run_tests: tests | 104 | run_tests: tests |
| 92 | ./libsmctrl_test_global_mask | 105 | ./libsmctrl_test_global_mask |
diff --git a/nvtaskset.c b/nvtaskset.c index 5cf3a85..3a50221 100644 --- a/nvtaskset.c +++ b/nvtaskset.c | |||
| @@ -359,16 +359,34 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state){ | |||
| 359 | else | 359 | else |
| 360 | printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(enable_mask >> 64), (uint64_t)enable_mask); | 360 | printf("PID %d's current TPC affinity mask: 0x%.0lx%016lx\n", target_pid, (uint64_t)(enable_mask >> 64), (uint64_t)enable_mask); |
| 361 | } else if (is_cmd) { | 361 | } else if (is_cmd) { |
| 362 | if (!getenv("CUDA_MPS_PIPE_DIRECTORY")) { | ||
| 363 | // Pipe directory is not set by default on L4T aarch64 | ||
| 364 | putenv("CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps"); | ||
| 365 | } | ||
| 362 | // start MPS (as needed) | 366 | // start MPS (as needed) |
| 363 | if (!libsmctrl_is_mps_running()) { | 367 | if (!libsmctrl_is_mps_running()) { |
| 364 | fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n"); | 368 | fprintf(stderr, "nvtaskset: MPS control deamon does not appear to be running. Automatically starting...\n"); |
| 369 | // TODO: Mute the error message if this command isn't found? | ||
| 365 | int ret = system("nvidia-cuda-mps-control -d"); | 370 | int ret = system("nvidia-cuda-mps-control -d"); |
| 371 | // TODO: Fall back to full x86_64 install location? | ||
| 372 | // Fall back to full L4T aarch64 install location | ||
| 373 | if (ret == 0x7f00) { | ||
| 374 | // nvidia-cuda-mps-control needs nvidia-cuda-mps-server to be on PATH | ||
| 375 | char *old_path = getenv("PATH"); | ||
| 376 | char *new_path; | ||
| 377 | if (old_path) | ||
| 378 | asprintf(&new_path, "PATH=/usr/local/cuda/compat/:%s", old_path); | ||
| 379 | else | ||
| 380 | new_path = "PATH=/usr/local/cuda/compat/"; | ||
| 381 | putenv(new_path); | ||
| 382 | ret = system("nvidia-cuda-mps-control -d"); | ||
| 383 | // TODO: Put this warning after error checking | ||
| 384 | fprintf(stderr, "nvtaskset: Warning: Set the CUDA_MPS_PIPE_DIRECTORY environment variable to /tmp/nvidia-mps to ensure that subsequently launched tasks associate with MPS on L4T systems!\n"); | ||
| 385 | } | ||
| 366 | if (ret == -1) | 386 | if (ret == -1) |
| 367 | error(1, errno, "Unable to run subshell to start MPS"); | 387 | error(1, errno, "Unable to run subshell to start MPS"); |
| 368 | if (ret == 1) { | 388 | else if (ret) |
| 369 | fprintf(stderr, "nvtaskset: Error starting MPS control deamon. Terminating...\n"); | 389 | error(1, 0, "Error starting MPS control deamon. Terminating..."); |
| 370 | return 1; | ||
| 371 | } | ||
| 372 | fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n"); | 390 | fprintf(stderr, "nvtaskset: Done. Use \"echo quit | nvidia-cuda-mps-control\" to terminate it later as desired.\n"); |
| 373 | } | 391 | } |
| 374 | // launch subprocess | 392 | // launch subprocess |
| @@ -377,6 +395,7 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state){ | |||
| 377 | snprintf(mask_str, 36, "~0x%.0lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask); | 395 | snprintf(mask_str, 36, "~0x%.0lx%016lx", (uint64_t)(mask >> 64), (uint64_t)mask); |
| 378 | setenv("LIBSMCTRL_MASK", mask_str, 1); | 396 | setenv("LIBSMCTRL_MASK", mask_str, 1); |
| 379 | // Start task | 397 | // Start task |
| 398 | // TODO: Check that the loader is configured to find the corrrect libcuda.so.1 | ||
| 380 | execvp(sub_argv[0], sub_argv); | 399 | execvp(sub_argv[0], sub_argv); |
| 381 | error(1, errno, "Unable to launch task '%s'", sub_argv[0]); | 400 | error(1, errno, "Unable to launch task '%s'", sub_argv[0]); |
| 382 | } else { | 401 | } else { |
