From c250928930cb5c95bffc878913301f9a5d4efcb7 Mon Sep 17 00:00:00 2001
From: Joshua Bakita <bakitajoshua@gmail.com>
Date: Mon, 5 May 2025 03:13:30 -0400
Subject: De-duplicate CUDA version checks and omit when building with CUDA >
 6.5

Code built with CUDA > 6.5 cannot run on CUDA 6.5 or older, so the
check added unnecessary overhead.

Verified that builds with CUDA 6.5 and CUDA 10.2 generate the correct
code, and that global and next masking work on a GTX 1060 3 GB with
either build while using CUDA 10.2 at runtime.
---
 libsmctrl.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/libsmctrl.c b/libsmctrl.c
index 5a45611..5ee94fb 100644
--- a/libsmctrl.c
+++ b/libsmctrl.c
@@ -119,10 +119,23 @@ static void setup_sm_control_callback() {
 	int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
 	uintptr_t* tbl_base;
 	uint32_t my_hndl;
-	// Avoid race conditions (setup can only be called once)
+	// Avoid race conditions (setup should only run once)
 	if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
 		return;
 
+#if CUDA_VERSION <= 6050
+	// Verify supported CUDA version
+	// It's impossible for us to run with a version of CUDA older than we were
+	// built by, so this check is excluded if built with CUDA > 6.5.
+	int ver = 0;
+	cuDriverGetVersion(&ver);
+	if (ver < 6050)
+		abort(1, ENOSYS, "Global or next masking requires at least CUDA 6.5; "
+		                 "this application is using CUDA %d.%d",
+		                 ver / 1000, (ver % 100));
+#endif
+
+	// Set up callback
 	cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
 	uintptr_t subscribe_func_addr = *(tbl_base + 3);
 	uintptr_t enable_func_addr = *(tbl_base + 6);
@@ -139,31 +152,13 @@ static void setup_sm_control_callback() {
 
 // Set default mask for all launches
 void libsmctrl_set_global_mask(uint64_t mask) {
-	if (!sm_control_setup_called) {
-		// The version will not change while running, so only check once
-		int ver = 0;
-		cuDriverGetVersion(&ver);
-		if (ver < 6050)
-			abort(1, ENOSYS, "Global masking requires at least CUDA 6.5; "
-			                 "this application is using CUDA %d.%d",
-			                 ver / 1000, (ver % 100));
-		setup_sm_control_callback();
-	}
+	setup_sm_control_callback();
 	g_sm_mask = mask;
 }
 
 // Set mask for next launch from this thread
 void libsmctrl_set_next_mask(uint64_t mask) {
-	if (!sm_control_setup_called) {
-		// The version will not change while running, so only check once
-		int ver = 0;
-		cuDriverGetVersion(&ver);
-		if (ver < 6050)
-			abort(1, ENOSYS, "Next masking requires at least CUDA 6.5; "
-			                 "this application is using CUDA %d.%d",
-			                 ver / 1000, (ver % 100));
-		setup_sm_control_callback();
-	}
+	setup_sm_control_callback();
 	g_next_sm_mask = mask;
 }
 
-- 
cgit v1.2.2