// libsmctrl_test_mask_shared.cu
// Copyright 2023-2025 Joshua Bakita
#include <error.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>  // For uint8_t and uint32_t
#include <stdlib.h>  // For malloc(), qsort(), and system()
#include <stdbool.h>
#include <cuda_runtime.h>
#include <unistd.h> // For getpid()

#include "libsmctrl.h"
#include "testbench.h"
#include "libsmctrl_test_mask_shared.h"

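// Store the ID of the SM executing each block into smid_arr[blockIdx.x].
// %smid is a PTX special register containing the physical ID of the SM that
// the reading thread runs on; only one thread per block needs to sample it.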
__global__ void read_and_store_smid(uint8_t* smid_arr) {
  if (threadIdx.x != 1)
    return;
  int smid;
  asm("mov.u32 %0, %%smid;" : "=r"(smid));
  smid_arr[blockIdx.x] = smid;
}

// Need at least as many blocks as there are SMs on NVIDIA's biggest GPUs
#define NUM_BLOCKS 142

static int sort_asc(const void* a, const void* b) {
  return *(uint8_t*)a - *(uint8_t*)b;
}

// Warning: Sorts (mutates) the input array ascending in-place via qsort
static int count_unique(uint8_t* arr, int len) {
  qsort(arr, len, 1, sort_asc);
  int num_uniq = 1;
  for (int i = 0; i < len - 1; i++)
    num_uniq += (arr[i] != arr[i + 1]);
  return num_uniq;
}
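// E.g., given {3, 1, 3, 0}, count_unique() sorts the array to {0, 1, 3, 3}
// and returns 3.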

// Test that adding an SM mask:
// 1. Constrains the number of SMs accessible
// 2. Constrains an application to the correct subset of SMs
int test_constrained_size_and_location(enum partitioning_type part_type) {
  int res;
  uint8_t *smids_native_d, *smids_native_h;
  uint8_t *smids_partitioned_d, *smids_partitioned_h;
  int uniq_native, uniq_partitioned;
  uint32_t num_tpcs;
  int num_sms, sms_per_tpc;
  cudaStream_t stream;

  SAFE(cudaStreamCreate(&stream));

  // Determine number of SMs per TPC
  SAFE(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
  if ((res = libsmctrl_get_tpc_info_cuda(&num_tpcs, 0)))
    error(1, res, "Unable to get TPC configuration for test");
  sms_per_tpc = num_sms / num_tpcs;
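  // (On Volta and later NVIDIA architectures each TPC contains two SMs, so
  // sms_per_tpc is typically 2.)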

  // Test baseline (native) behavior without partitioning
  SAFE(cudaMalloc(&smids_native_d, NUM_BLOCKS));
  if (!(smids_native_h = (uint8_t*)malloc(NUM_BLOCKS)))
    error(1, errno, "Unable to allocate memory for test");
  read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_native_d);
  SAFE(cudaMemcpy(smids_native_h, smids_native_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

  uniq_native = count_unique(smids_native_h, NUM_BLOCKS);
  if (uniq_native < sms_per_tpc) {
    printf("%s: ***Test failure.***\n"
           "%s: Reason: In baseline test, %d blocks of 1024 threads were "
           "launched on the GPU, but only %d SMs were utilized, when it was "
           "expected that at least %d would be used.\n",
           program_invocation_name, program_invocation_name, NUM_BLOCKS,
           uniq_native, sms_per_tpc);
    return 1;
  }

  // Test at 32-TPC boundaries to verify that the mask is applied in the
  // correct order to each of the QMD/stream struct fields.
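  // (The 128-bit mask presumably spans four 32-bit hardware fields; enabling
  // TPC 0, 32, 64, and 96 in turn exercises each field once.)
  // One success-summary string per tested 32-TPC block, printed on pass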
  char* reason[4] = {0};
  for (int enabled_tpc = 0; enabled_tpc < num_tpcs && enabled_tpc < 128; enabled_tpc += 32) {
    uint128_t mask = 1;
    mask <<= enabled_tpc;
    mask = ~mask;
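    // E.g., enabled_tpc == 32 yields mask == ~((uint128_t)1 << 32): all bits
    // set except bit 32. libsmctrl masks are disable masks (a set bit
    // disables the corresponding TPC), so only TPC 32 remains enabled.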

    // Apply partitioning to enable only the first TPC of each 32-bit block
    switch (part_type) {
      case PARTITION_SUPREME:
        char cmd[80];
        // We must invert the mask before passing it to nvtaskset, since
        // nvtaskset takes an enable mask (as with the taskset command)
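        // E.g., for enabled_tpc == 0 (mask == ~(uint128_t)1), this builds:
        //   ./nvtaskset -p 0x0000000000000001 <pid> > /dev/null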
        snprintf(cmd, 80, "./nvtaskset -p 0x%.0lx%016lx %d > /dev/null",
                 ~(uint64_t)(mask >> 64), ~(uint64_t)mask, getpid());
        system(cmd);
        break;
      case PARTITION_GLOBAL:
        libsmctrl_set_global_mask(mask);
        break;
      case PARTITION_STREAM:
        libsmctrl_set_stream_mask_ext(stream, mask);
        break;
      case PARTITION_STREAM_OVERRIDE:
        libsmctrl_set_global_mask(~mask);
        libsmctrl_set_stream_mask_ext(stream, mask);
        break;
      case PARTITION_NEXT:
        libsmctrl_set_next_mask(mask);
        break;
      case PARTITION_NEXT_OVERRIDE:
        libsmctrl_set_global_mask(~mask);
        libsmctrl_set_stream_mask_ext(stream, ~mask);
        libsmctrl_set_next_mask(mask);
        break;
      default:
        error(1, 0, "Shared test core called with unrecognized partitioning type.");
    }

    // Verify that partitioning changes the SMID distribution
    SAFE(cudaMalloc(&smids_partitioned_d, NUM_BLOCKS));
    if (!(smids_partitioned_h = (uint8_t*)malloc(NUM_BLOCKS)))
      error(1, errno, "Unable to allocate memory for test");
    read_and_store_smid<<<NUM_BLOCKS, 1024, 0, stream>>>(smids_partitioned_d);
    SAFE(cudaMemcpy(smids_partitioned_h, smids_partitioned_d, NUM_BLOCKS, cudaMemcpyDeviceToHost));

    // Make sure the kernel ran on no more than one TPC's worth of SMs
    // (each TPC contains at most two SMs on current architectures)
    uniq_partitioned = count_unique(smids_partitioned_h, NUM_BLOCKS); // Also sorts
    if (uniq_partitioned > sms_per_tpc) {
      printf("%s: ***Test failure.***\n"
             "%s: Reason: With a partition of only one TPC, the test kernel "
             "of %d blocks of 1024 threads ran on %d SMs (at most %d---one "
             "TPC---expected).\n",
             program_invocation_name, program_invocation_name, NUM_BLOCKS,
             uniq_partitioned, sms_per_tpc);
      return 1;
    }

    // Make sure it ran on the right TPC (the array is sorted, so index 0
    // holds the lowest SM ID used and NUM_BLOCKS - 1 the highest)
    if (smids_partitioned_h[NUM_BLOCKS - 1] > (enabled_tpc * sms_per_tpc) + sms_per_tpc - 1 ||
        smids_partitioned_h[NUM_BLOCKS - 1] < (enabled_tpc * sms_per_tpc)) {
      printf("%s: ***Test failure.***\n"
             "%s: Reason: With a partition of only TPC %d, the test kernel "
             "ran on SM IDs as high as %d and as low as %d (range of %d to %d "
             "expected).\n",
             program_invocation_name, program_invocation_name, enabled_tpc,
             smids_partitioned_h[NUM_BLOCKS - 1], smids_partitioned_h[0],
             enabled_tpc * sms_per_tpc + sms_per_tpc - 1,
             enabled_tpc * sms_per_tpc);
      return 1;
    }

    // Div by 32 via a shift
    asprintf(&reason[enabled_tpc >> 5],
             "With a partition of only TPC %d, the test kernel used only %d "
             "SMs (%d without), and all had IDs between %d and %d (were "
             "contained in TPC %d).",
             enabled_tpc, uniq_partitioned, uniq_native,
             smids_partitioned_h[0], smids_partitioned_h[NUM_BLOCKS - 1],
             enabled_tpc);
  }

  printf("%s: Test passed!\n", program_invocation_name);
  for (int i = 0; i < 4 && reason[i]; i++)
    printf("%s: Reason %d: %s\n", program_invocation_name, i + 1, reason[i]);
  return 0;
}
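
// A minimal driver for this shared test core (a sketch; the per-mask-type
// drivers, e.g. libsmctrl_test_global_mask.cu, are built alongside this
// file):
//
//   #include "libsmctrl_test_mask_shared.h"
//
//   int main() {
//     return test_constrained_size_and_location(PARTITION_GLOBAL);
//   }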