diff options
author | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-25 20:01:19 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2025-06-25 20:43:38 -0400 |
commit | 9b4b4f71fd843c3ec97ca6f55935675e62ca31f5 (patch) | |
tree | e38f415c07415078e37ad8e1a0e08b6ff513114b /libsmctrl.c | |
parent | e0cb12762d048e388a81cffae3e96bfe2bd672cc (diff) |
Speed up nvtaskset by skipping CUDA context creation if possible
The GPU needs to be on before the GPC-to-TPC mapping registers can
be read. The easiest way to power on the GPU is to create a CUDA
context, but this is fairly expensive.
In nvtaskset, some GPU-using task will likely already be running,
and so we can skip CUDA context creation in the common case.
Bug fixes:
- Delete the temporary CUDA context created by nvtaskset after
it is done with it. Fixes bug where nvtaskset would leak this
context into any program it launches.
Additional changes:
- Style fixes in libsmctrl.c
- Remove superfluous newlines from error() calls in nvtaskset.c
Diffstat (limited to 'libsmctrl.c')
-rw-r--r-- | libsmctrl.c | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index 79d2b33..807cd6d 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -582,11 +582,11 @@ int libsmctrl_get_gpc_info_ext(uint32_t* num_enabled_gpcs, uint128_t** tpcs_for_ | |||
582 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | 582 | int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { |
583 | uint32_t num_gpcs; | 583 | uint32_t num_gpcs; |
584 | uint128_t* tpcs_per_gpc; | 584 | uint128_t* tpcs_per_gpc; |
585 | int res; | 585 | int res, gpc; |
586 | if (res = libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_per_gpc, dev)) | 586 | if (res = libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_per_gpc, dev)) |
587 | return res; | 587 | return res; |
588 | *num_tpcs = 0; | 588 | *num_tpcs = 0; |
589 | for (int gpc = 0; gpc < num_gpcs; gpc++) { | 589 | for (gpc = 0; gpc < num_gpcs; gpc++) { |
590 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]); | 590 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]); |
591 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc] >> 64); | 591 | *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc] >> 64); |
592 | } | 592 | } |
@@ -596,7 +596,7 @@ int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) { | |||
596 | // @param dev Device index as understood by CUDA **can differ from nvdebug idx** | 596 | // @param dev Device index as understood by CUDA **can differ from nvdebug idx** |
597 | // This implementation is fragile, and could be incorrect for odd GPUs | 597 | // This implementation is fragile, and could be incorrect for odd GPUs |
598 | int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { | 598 | int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { |
599 | int num_sms, major, minor, res = 0; | 599 | int num_sms, sms_per_tpc, major, minor, res = 0; |
600 | const char* err_str; | 600 | const char* err_str; |
601 | if (res = cuInit(0)) | 601 | if (res = cuInit(0)) |
602 | goto abort_cuda; | 602 | goto abort_cuda; |
@@ -611,7 +611,6 @@ int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) { | |||
611 | return ENOTSUP; | 611 | return ENOTSUP; |
612 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well | 612 | // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well |
613 | // as the P100, which is uniquely sm_60 | 613 | // as the P100, which is uniquely sm_60 |
614 | int sms_per_tpc; | ||
615 | if (major > 6 || (major == 6 && minor == 0)) | 614 | if (major > 6 || (major == 6 && minor == 0)) |
616 | sms_per_tpc = 2; | 615 | sms_per_tpc = 2; |
617 | else | 616 | else |
@@ -708,6 +707,7 @@ __attribute__((constructor)) static void setup(void) { | |||
708 | // memory segment. | 707 | // memory segment. |
709 | static uint128_t mask; | 708 | static uint128_t mask; |
710 | bool invert = false; | 709 | bool invert = false; |
710 | int fd; | ||
711 | 711 | ||
712 | mask_str = getenv("LIBSMCTRL_MASK"); | 712 | mask_str = getenv("LIBSMCTRL_MASK"); |
713 | 713 | ||
@@ -742,7 +742,7 @@ __attribute__((constructor)) static void setup(void) { | |||
742 | 742 | ||
743 | // Create shared memory region for the supreme mask such that nvtaskset | 743 | // Create shared memory region for the supreme mask such that nvtaskset |
744 | // can read and modify it | 744 | // can read and modify it |
745 | int fd = memfd_create("libsmctrl", MFD_CLOEXEC); | 745 | fd = memfd_create("libsmctrl", MFD_CLOEXEC); |
746 | if (fd == -1) { | 746 | if (fd == -1) { |
747 | abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); | 747 | abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled"); |
748 | g_supreme_sm_mask = &mask; | 748 | g_supreme_sm_mask = &mask; |