aboutsummaryrefslogtreecommitdiffstats
path: root/libsmctrl_test_mask_shared.cu
diff options
context:
space:
mode:
Diffstat (limited to 'libsmctrl_test_mask_shared.cu')
-rw-r--r--libsmctrl_test_mask_shared.cu140
1 files changed, 140 insertions, 0 deletions
diff --git a/libsmctrl_test_mask_shared.cu b/libsmctrl_test_mask_shared.cu
new file mode 100644
index 0000000..c3817fa
--- /dev/null
+++ b/libsmctrl_test_mask_shared.cu
@@ -0,0 +1,140 @@
1// Copyright 2023 Joshua Bakita
2#include <error.h>
3#include <errno.h>
4#include <stdio.h>
5#include <stdbool.h>
6#include <cuda_runtime.h>
7
8#include "libsmctrl.h"
9#include "testbench.h"
10#include "libsmctrl_test_mask_shared.h"
11
// Record, in smid_arr[blockIdx.x], the ID of the SM this block executed on.
// One representative thread per block performs the store; SM IDs are
// truncated to 8 bits (assumes GPUs with fewer than 256 SMs).
__global__ void read_and_store_smid(uint8_t* smid_arr) {
	if (threadIdx.x == 1) {
		int my_smid;
		asm("mov.u32 %0, %%smid;" : "=r"(my_smid));
		smid_arr[blockIdx.x] = my_smid;
	}
}
19
// Assuming SMs continue to support a maximum of 2048 resident threads, each
// SM can host at most two 1024-thread blocks, so NUM_BLOCKS blocks of 1024
// threads should span many SMs without partitioning
22#define NUM_BLOCKS 142 //6
23
// qsort() comparator: orders uint8_t values ascending
static int sort_asc(const void* a, const void* b) {
	uint8_t lhs = *(const uint8_t*)a;
	uint8_t rhs = *(const uint8_t*)b;
	return (int)lhs - (int)rhs;
}
27
// Count the number of distinct values in arr.
// Warning: Mutates input array via qsort (leaves it sorted ascending)
// Returns 0 for an empty array (previously incorrectly returned 1).
static int count_unique(uint8_t* arr, int len) {
	if (len <= 0)
		return 0;
	qsort(arr, len, 1, sort_asc);
	// Sorted order means each new value differs from its predecessor
	int num_uniq = 1;
	for (int i = 0; i < len - 1; i++)
		num_uniq += (arr[i] != arr[i + 1]);
	return num_uniq;
}
36
// Test that adding an SM mask:
// 1. Constrains the number of SMs accessible
// 2. Constrains an application to the correct subset of SMs
// Returns 0 on success, 1 on test failure; exits via error() on setup
// failure. (Early failure returns leave allocations for the OS to reclaim at
// process exit, as this is expected to run as a standalone test binary.)
int test_constrained_size_and_location(enum partitioning_type part_type) {
	int res;
	uint8_t *smids_native_d, *smids_native_h;
	uint8_t *smids_partitioned_d, *smids_partitioned_h;
	int uniq_native, uniq_partitioned;
	uint32_t num_tpcs;
	int num_sms, sms_per_tpc;
	cudaStream_t stream;

	SAFE(cudaStreamCreate(&stream));

	// Determine number of SMs per TPC
	SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
	if ((res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0)))
		error(1, res, "Unable to get TPC configuration for test");
	sms_per_tpc = num_sms/num_tpcs;

	// Allocate all buffers once, up-front. (Previously the partitioned
	// buffers were re-allocated on every loop iteration below and never
	// freed, leaking host and device memory.)
	SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
	SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
	if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "Unable to allocate memory for test");
	if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
		error(1, errno, "Unable to allocate memory for test");

	// Test baseline (native) behavior without partitioning
	read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_native_d);
	SAFE(cudaGetLastError()); // Surface any kernel-launch-configuration error
	SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

	uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
	if (uniq_native < sms_per_tpc) {
		printf("%s: ***Test failure.***\n"
		       "%s: Reason: In baseline test, %d blocks of 1024 "
		       "threads were launched on the GPU, but only %d SMs were utilized, "
		       "when it was expected that at least %d would be used.\n", program_invocation_name, program_invocation_name, NUM_BLOCKS,
		       uniq_native, sms_per_tpc);
		return 1;
	}

	// Test at 32-TPC boundaries to verify that the mask is applied in the
	// correct order to each of the QMD/stream struct fields.
	char* reason[4] = {0};
	for (int enabled_tpc = 0; enabled_tpc < num_tpcs && enabled_tpc < 128; enabled_tpc += 32) {
		// Build a mask with every bit set except `enabled_tpc`'s
		// (a set bit disables the corresponding TPC)
		uint128_t mask = 1;
		mask <<= enabled_tpc;
		mask = ~mask;

		// Apply partitioning to enable only the first TPC of each 32-bit block
		switch (part_type) {
		case PARTITION_GLOBAL:
			libsmctrl_set_global_mask(mask);
			break;
		case PARTITION_STREAM:
			libsmctrl_set_stream_mask_ext(stream, mask);
			break;
		case PARTITION_NEXT:
			libsmctrl_set_next_mask(mask);
			break;
		default:
			error(1, 0, "Shared test core called with unrecognized partitioning type.");
		}

		// Verify that partitioning changes the SMID distribution
		read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_partitioned_d);
		SAFE(cudaGetLastError()); // Surface any kernel-launch-configuration error
		SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

		// Make sure it only ran on the number of TPCs provided
		// May run on up to two SMs, as up to two per TPC
		uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Sorts too
		if (uniq_partitioned > sms_per_tpc) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to "
			       "constrain all kernels to a single TPC, a kernel of %d blocks of "
			       "1024 threads was launched and found to run on %d SMs (at most %d---"
			       "one TPC---expected).\n", program_invocation_name, program_invocation_name, NUM_BLOCKS, uniq_partitioned, sms_per_tpc);
			return 1;
		}

		// Make sure it ran on the right TPC. count_unique() sorted the
		// array, so [0] is the lowest SM ID seen and [NUM_BLOCKS - 1]
		// the highest. (The lower-bound check previously tested the
		// maximum against the lower bound, missing too-low SM IDs.)
		if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
		    smids_partitioned_h[0] < (enabled_tpc * sms_per_tpc)) {
			printf("%s: ***Test failure.***\n"
			       "%s: Reason: With TPC mask set to"
			       "constrain all kernels to TPC %d, a kernel was run and found "
			       "to run on an SM IDs: as high as %d and as low as %d (range of %d to %d expected).\n",
			       program_invocation_name, program_invocation_name, enabled_tpc, smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0], enabled_tpc * sms_per_tpc + sms_per_tpc - 1, enabled_tpc * sms_per_tpc);
			return 1;
		}

		// Div by 32 via a shift
		asprintf(&reason[enabled_tpc >> 5],
		         "With a partition enabled which "
		         "contained only TPC ID %d, the test kernel was found to use only %d "
		         "SMs (%d without), and all SMs in-use had IDs between %d and %d (were contained"
		         " in TPC %d).", enabled_tpc, uniq_partitioned, uniq_native, smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1], enabled_tpc);
	}

	// All subtests passed; release test resources
	free(smids_native_h);
	free(smids_partitioned_h);
	SAFE(cudaFree(smids_native_d));
	SAFE(cudaFree(smids_partitioned_d));
	SAFE(cudaStreamDestroy(stream));

	printf("%s: Test passed!\n", program_invocation_name);
	for (int i = 0; i < 4 && reason[i]; i++) {
		printf("%s: Reason %d: %s\n", program_invocation_name, i + 1, reason[i]);
		free(reason[i]); // asprintf() allocated; release after printing
	}
	return 0;
}
140