2 files changed, 113 insertions, 4 deletions
diff --git a/Makefile b/Makefile
index aa59792..cfbd971 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 CC = gcc
+NVCC ?= nvcc
 # -fPIC is needed in all cases, as we may be linked into another shared library
 CFLAGS = -fPIC
 LDFLAGS = -lcuda -I/usr/local/cuda/include
@@ -12,10 +13,15 @@ libsmctrl.a: libsmctrl.c libsmctrl.h
        $(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
        ar rcs $@ libsmctrl.o
-libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c
+# Use static linking with tests to avoid LD_LIBRARY_PATH issues
-        $(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS)
+libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a
+        $(CC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
-tests: libsmctrl_test_gpc_info
+libsmctrl_test_global_mask: libsmctrl_test_global_mask.cu libsmctrl.a
+        $(NVCC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
+tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask
 clean:
-        rm -f libsmctrl.so libsmctrl.a
+        rm -f libsmctrl.so libsmctrl.a libsmctrl_test_gpu_info \
+              libsmctrl_test_global_mask
diff --git a/libsmctrl_test_global_mask.cu b/libsmctrl_test_global_mask.cu
new file mode 100644
index 0000000..f6a487f
--- /dev/null
+++ b/libsmctrl_test_global_mask.cu
@@ -0,0 +1,103 @@
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <cuda_runtime.h>
+#include "libsmctrl.h"
+#include "testbench.h"
+__global__ void read_and_store_smid(uint8_t* smid_arr) {
+  if (threadIdx.x != 1)
+    return;
+  int smid;
+  asm("mov.u32 %0, %%smid;" : "=r"(smid));
+  smid_arr[blockIdx.x] = smid;
+}
+// Assuming SMs continue to support a maximum of 2048 resident threads, six
+// blocks of 1024 threads should span at least three SMs without partitioning
+#define NUM_BLOCKS 6
+int sort_asc(const void* a, const void* b) {
+  return *(uint8_t*)a - *(uint8_t*)b;
+}
+// Warning: Mutates input array via qsort
+int count_unique(uint8_t* arr, int len) {
+  qsort(arr, len, 1, sort_asc);
+  int num_uniq = 1;
+  for (int i = 0; i < len - 1; i++)
+    num_uniq += (arr[i] != arr[i + 1]);
+  return num_uniq;
+}
+int main() {
+  cudaError_t err; // Needed by SAFE() macro
+  int res;
+  uint8_t *smids_native_d, *smids_native_h;
+  uint8_t *smids_partitioned_d, *smids_partitioned_h;
+  int uniq_native, uniq_partitioned;
+  uint32_t num_tpcs;
+  int num_sms, sms_per_tpc;
+  // Determine number of SMs per TPC
+  SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
+  if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
+    error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test");
+  sms_per_tpc = num_sms/num_tpcs;
+  // Test baseline (native) behavior without partitioning
+  SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
+  if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
+    error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
+  read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d);
+  SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
+  uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
+  if (uniq_native < sms_per_tpc) {
+    printf("libsmctrl_test_global: ***Test failure.***\n"
+           "libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 "
+           "threads were launched on the GPU, but only %d SMs were utilized, "
+           "when it was expected that at least %d would be used.\n", NUM_BLOCKS,
+           uniq_native, sms_per_tpc);
+    return 1;
+  }
+  // Verify that partitioning changes the SMID distribution
+  libsmctrl_set_global_mask(~0x1); // Enable only one TPC
+  SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
+  if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
+    error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
+  read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d);
+  SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
+  // Make sure it only ran on the number of TPCs provided
+  // May run on up to two SMs, as up to two per TPC
+  uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS);
+  if (uniq_partitioned > sms_per_tpc) {
+    printf("libsmctrl_test_global: ***Test failure.***\n"
+           "libsmctrl_test_global: Reason: With global TPC mask set to "
+           "constrain all kernels to a single TPC, a kernel of %d blocks of "
+           "1024 threads was launched and found to run on %d SMs (at most %d---"
+           "one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
+    return 1;
+  }
+  // Make sure it ran on the right TPC
+  if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) {
+    printf("libsmctrl_test_global: ***Test failure.***\n"
+           "libsmctrl_test_global: Reason: With global TPC mask set to"
+           "constrain all kernels to the first TPC, a kernel was run and found "
+           "to run on an SM ID as high as %d (max of %d expected).\n",
+           smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1);
+    return 1;
+  }
+  printf("libsmctrl_test_global: Test passed!\n"
+         "libsmctrl_test_global: Reason: With a global partition enabled which "
+         "contained only TPC ID 0, the test kernel was found to use only %d "
+         "SMs (%d without), and all SMs in-use had IDs below %d (were contained"
+         " in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc);
+  return 0;
+}

diff --git a/Makefile b/Makefile index aa59792..cfbd971 100644 --- a/Makefile +++ b/Makefile
@@ -1,4 +1,5 @@
1	CC = gcc	1	CC = gcc
		2	NVCC ?= nvcc
2	# -fPIC is needed in all cases, as we may be linked into another shared library	3	# -fPIC is needed in all cases, as we may be linked into another shared library
3	CFLAGS = -fPIC	4	CFLAGS = -fPIC
4	LDFLAGS = -lcuda -I/usr/local/cuda/include	5	LDFLAGS = -lcuda -I/usr/local/cuda/include
@@ -12,10 +13,15 @@ libsmctrl.a: libsmctrl.c libsmctrl.h
12	$(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)	13	$(CC) $< -c -o libsmctrl.o $(CFLAGS) $(LDFLAGS)
13	ar rcs $@ libsmctrl.o	14	ar rcs $@ libsmctrl.o
14		15
15	libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c	16	# Use static linking with tests to avoid LD_LIBRARY_PATH issues
16	$(CC) $< -o $@ -L. -lsmctrl $(LDFLAGS)	17	libsmctrl_test_gpc_info: libsmctrl_test_gpc_info.c libsmctrl.a
		18	$(CC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
17		19
18	tests: libsmctrl_test_gpc_info	20	libsmctrl_test_global_mask: libsmctrl_test_global_mask.cu libsmctrl.a
		21	$(NVCC) $< -o $@ -g -L. -l:libsmctrl.a $(LDFLAGS)
		22
		23	tests: libsmctrl_test_gpc_info libsmctrl_test_global_mask
19		24
20	clean:	25	clean:
21	rm -f libsmctrl.so libsmctrl.a	26	rm -f libsmctrl.so libsmctrl.a libsmctrl_test_gpu_info \
		27	libsmctrl_test_global_mask


diff --git a/libsmctrl_test_global_mask.cu b/libsmctrl_test_global_mask.cu new file mode 100644 index 0000000..f6a487f --- /dev/null +++ b/libsmctrl_test_global_mask.cu
@@ -0,0 +1,103 @@
		1	#include <error.h>
		2	#include <errno.h>
		3	#include <stdio.h>
		4	#include <stdbool.h>
		5	#include <cuda_runtime.h>
		6
		7	#include "libsmctrl.h"
		8	#include "testbench.h"
		9
		10	__global__ void read_and_store_smid(uint8_t* smid_arr) {
		11	if (threadIdx.x != 1)
		12	return;
		13	int smid;
		14	asm("mov.u32 %0, %%smid;" : "=r"(smid));
		15	smid_arr[blockIdx.x] = smid;
		16	}
		17
		18	// Assuming SMs continue to support a maximum of 2048 resident threads, six
		19	// blocks of 1024 threads should span at least three SMs without partitioning
		20	#define NUM_BLOCKS 6
		21
		22	int sort_asc(const void* a, const void* b) {
		23	return (uint8_t)a - (uint8_t)b;
		24	}
		25
		26	// Warning: Mutates input array via qsort
		27	int count_unique(uint8_t* arr, int len) {
		28	qsort(arr, len, 1, sort_asc);
		29	int num_uniq = 1;
		30	for (int i = 0; i < len - 1; i++)
		31	num_uniq += (arr[i] != arr[i + 1]);
		32	return num_uniq;
		33	}
		34
		35	int main() {
		36	cudaError_t err; // Needed by SAFE() macro
		37	int res;
		38	uint8_t smids_native_d, smids_native_h;
		39	uint8_t smids_partitioned_d, smids_partitioned_h;
		40	int uniq_native, uniq_partitioned;
		41	uint32_t num_tpcs;
		42	int num_sms, sms_per_tpc;
		43
		44	// Determine number of SMs per TPC
		45	SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
		46	if (res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0))
		47	error(1, res, "libsmctrl_test_global: Unable to get TPC configuration for test");
		48	sms_per_tpc = num_sms/num_tpcs;
		49
		50	// Test baseline (native) behavior without partitioning
		51	SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
		52	if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
		53	error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
		54	read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_native_d);
		55	SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
		56
		57	uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
		58	if (uniq_native < sms_per_tpc) {
		59	printf("libsmctrl_test_global: *Test failure.*\n"
		60	"libsmctrl_test_global: Reason: In baseline test, %d blocks of 1024 "
		61	"threads were launched on the GPU, but only %d SMs were utilized, "
		62	"when it was expected that at least %d would be used.\n", NUM_BLOCKS,
		63	uniq_native, sms_per_tpc);
		64	return 1;
		65	}
		66
		67	// Verify that partitioning changes the SMID distribution
		68	libsmctrl_set_global_mask(~0x1); // Enable only one TPC
		69	SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
		70	if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
		71	error(1, errno, "libsmctrl_test_global: Unable to allocate memory for test");
		72	read_and_store_smid<<<NUM_BLOCKS, 1024>>>(smids_partitioned_d);
		73	SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));
		74
		75	// Make sure it only ran on the number of TPCs provided
		76	// May run on up to two SMs, as up to two per TPC
		77	uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS);
		78	if (uniq_partitioned > sms_per_tpc) {
		79	printf("libsmctrl_test_global: *Test failure.*\n"
		80	"libsmctrl_test_global: Reason: With global TPC mask set to "
		81	"constrain all kernels to a single TPC, a kernel of %d blocks of "
		82	"1024 threads was launched and found to run on %d SMs (at most %d---"
		83	"one TPC---expected).\n", NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
		84	return 1;
		85	}
		86
		87	// Make sure it ran on the right TPC
		88	if (smids_partitioned_h[NUM_BLOCKS - 1] > sms_per_tpc - 1) {
		89	printf("libsmctrl_test_global: *Test failure.*\n"
		90	"libsmctrl_test_global: Reason: With global TPC mask set to"
		91	"constrain all kernels to the first TPC, a kernel was run and found "
		92	"to run on an SM ID as high as %d (max of %d expected).\n",
		93	smids_partitioned_h[NUM_BLOCKS - 1], sms_per_tpc - 1);
		94	return 1;
		95	}
		96
		97	printf("libsmctrl_test_global: Test passed!\n"
		98	"libsmctrl_test_global: Reason: With a global partition enabled which "
		99	"contained only TPC ID 0, the test kernel was found to use only %d "
		100	"SMs (%d without), and all SMs in-use had IDs below %d (were contained"
		101	" in the first TPC).\n", uniq_partitioned, uniq_native, sms_per_tpc);
		102	return 0;
		103	}