2 files changed, 32 insertions, 7 deletions
diff --git a/libsmctrl.c b/libsmctrl.c
index 79d2b33..807cd6d 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -582,11 +582,11 @@ int libsmctrl_get_gpc_info_ext(uint32_t* num_enabled_gpcs, uint128_t** tpcs_for_
 int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
        uint32_t num_gpcs;
        uint128_t* tpcs_per_gpc;
-        int res;
+        int res, gpc;
        if (res = libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_per_gpc, dev))
                return res;
        *num_tpcs = 0;
-        for (int gpc = 0; gpc < num_gpcs; gpc++) {
+        for (gpc = 0; gpc < num_gpcs; gpc++) {
                *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
                *num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc] >> 64);
        }
@@ -596,7 +596,7 @@ int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
 // @param dev Device index as understood by CUDA **can differ from nvdebug idx**
 // This implementation is fragile, and could be incorrect for odd GPUs
 int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
-        int num_sms, major, minor, res = 0;
+        int num_sms, sms_per_tpc, major, minor, res = 0;
        const char* err_str;
        if (res = cuInit(0))
                goto abort_cuda;
@@ -611,7 +611,6 @@ int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
                return ENOTSUP;
        // Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
        // as the P100, which is uniquely sm_60
-        int sms_per_tpc;
        if (major > 6 || (major == 6 && minor == 0))
                sms_per_tpc = 2;
        else
@@ -708,6 +707,7 @@ __attribute__((constructor)) static void setup(void) {
        // memory segment.
        static uint128_t mask;
        bool invert = false;
+        int fd;
        mask_str = getenv("LIBSMCTRL_MASK");
@@ -742,7 +742,7 @@ __attribute__((constructor)) static void setup(void) {
        // Create shared memory region for the supreme mask such that nvtaskset
        // can read and modify it
-        int fd = memfd_create("libsmctrl", MFD_CLOEXEC);
+        fd = memfd_create("libsmctrl", MFD_CLOEXEC);
        if (fd == -1) {
                error(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
                g_supreme_sm_mask = &mask;
diff --git a/nvtaskset.c b/nvtaskset.c
index 4901cbe..5cf3a85 100644
--- a/nvtaskset.c
+++ b/nvtaskset.c
@@ -68,6 +68,25 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int
        int res;
        CUcontext ctx;
        char *old_order = NULL;
+        int old_stderr, dev_null_fd;
+        // Attempt to read the configuration, assuming the GPU is on, and fall
+        // back to creating a context if this fails.
+        // (Creating a CUDA context is very expensive and best avoided)
+        // (Redirect stderr while doing this to mute libsmctrl error messages)
+        if ((dev_null_fd = open("/dev/null", O_WRONLY)) == -1)
+                error(1, errno, "Unable to open /dev/null");
+        if ((old_stderr = dup(STDERR_FILENO)) == -1)
+                error(1, errno, "Unable to duplicate stderr file descriptor");
+        if (dup2(dev_null_fd, STDERR_FILENO) == -1)
+                error(1, errno, "Unable to overwrite stderr file descriptor");
+        res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id);
+        if (dup2(old_stderr, STDERR_FILENO) == -1)
+                error(1, errno, "Unable to restore stderr file descriptor");
+        // End if we were successful, otherwise fallback
+        if (res == 0)
+                return;
        // Tell CUDA to use PCI device id ordering (to match nvdebug)
        putenv((char*)"CUDA_DEVICE_ORDER=PCI_BUS_ID");
        // Allow CUDA to see all devices (to better match nvdebug)
@@ -80,12 +99,12 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int
        if ((res = cuInit(0))) {
                const char* name;
                cuGetErrorName(res, &name);
-                error(1, 0, "Unable to create a initialize CUDA, error %s\n", name);
+                error(1, 0, "Unable to initialize CUDA, error %s", name);
        }
        if ((res = cuCtxCreate(&ctx, 0, gpu_id))) {
                const char* name;
                cuGetErrorName(res, &name);
-                error(1, 0, "Unable to create a CUDA context, error %s\n", name);
+                error(1, 0, "Unable to create a CUDA context, error %s", name);
        }
        // Pull topology information from libsmctrl
        if ((res = libsmctrl_get_gpc_info_ext(num_gpcs, masks, gpu_id)) != 0) {
@@ -96,6 +115,12 @@ void libsmctrl_get_gpc_info_ext_easy(uint32_t* num_gpcs, uint128_t** masks, int
                        fprintf(stderr, "%s: Is the GPU powered on, i.e., is there an active context?\n", program_invocation_name);
                exit(1);
        }
+        // Delete the CUDA context
+        if ((res = cuCtxDestroy(ctx))) {
+                const char* name;
+                cuGetErrorName(res, &name);
+                error(1, 0, "Unable to destroy CUDA context, error %s", name);
+        }
        // Restore the environment (in case we exec() later)
        unsetenv("CUDA_DEVICE_ORDER");
        if (old_order) {