/**
* Copyright 2022-2025 Joshua Bakita
* Library to control SM masks on CUDA launches. Co-opts preexisting debug
* logic in the CUDA driver library, and thus requires a build with -lcuda.
*
* This file implements partitioning via two different mechanisms:
* - Modifying the QMD/TMD immediately prior to upload
* - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD
* This table shows the mechanism used with each CUDA version:
* +-----------+---------------+---------------+--------------+
* | Version | Global Mask | Stream Mask | Next Mask |
* +-----------+---------------+---------------+--------------+
* | 8.0-12.8 | TMD/QMD Hook | stream struct | TMD/QMD Hook |
* | 6.5-7.5 | TMD/QMD Hook | N/A | TMD/QMD Hook |
* +-----------+---------------+---------------+--------------+
* "N/A" indicates that a mask type is unsupported on that CUDA version.
* Please contact the authors if support is needed for a particular feature on
* an older CUDA version. Support for those is unimplemented, not impossible.
*
* An old implementation of this file affected the global mask on CUDA 10.2 by
* changing a field in CUDA's global struct that CUDA applies to the QMD/TMD.
* That implementation was extraordinarily complicated, and was replaced in
* 2024 with a more-backward-compatible way of hooking the TMD/QMD.
* View the old implementation via Git: `git show aa63a02e:libsmctrl.c`.
*/
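/*
 * Illustrative usage (a hedged sketch, not part of this library; the 16-TPC
 * GPU, `my_stream`, and the mask values are hypothetical). Every mask is a
 * *disable* mask: a set bit prevents work from using that TPC.
 *
 *   libsmctrl_set_global_mask(~0xfull);            // all launches: TPCs 0-3
 *   libsmctrl_set_stream_mask(my_stream, ~0x3ull); // this stream overrides
 *                                                  // the global mask: TPCs 0-1
 *   libsmctrl_set_next_mask(~0x1ull);              // next launch from this
 *                                                  // thread only: TPC 0
 */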
#define _GNU_SOURCE // To enable use of memfd_create()
#include <cuda.h>
#include <errno.h>
#include <error.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // getenv(), setenv(), atoi(), strtoll()
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#include "libsmctrl.h"
// In functions that do not return an error code, we favor terminating with an
// error rather than merely printing a warning and continuing.
#define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \
__VA_ARGS__)
/*** QMD/TMD-based SM Mask Control via Debug Callback. ***/
// Tested working on x86_64 CUDA 6.5, 9.1, and various 10+ versions
// (No testing attempted on pre-CUDA-6.5 versions)
// Values for the following three lines can be extracted by tracing CUPTI as
// it interacts with libcuda.so to set callbacks.
static const CUuuid callback_funcs_id = {{0x2c, (char)0x8e, 0x0a, (char)0xd8, 0x07, 0x10, (char)0xab, 0x4e, (char)0x90, (char)0xdd, 0x54, 0x71, (char)0x9f, (char)0xe5, (char)0xf7, 0x4b}};
// These callback descriptors appear to intercept the TMD/QMD late enough that
// CUDA has already applied the per-stream mask from its internal data
// structures, allowing us to override it with the next mask.
#define QMD_DOMAIN 0xb
#define QMD_PRE_UPLOAD 0x1
/**
* These globals must be non-static (i.e., have global linkage) to ensure that
* if multiple copies of the library are loaded (e.g., dynamically linked to
* both this program and a dependency), secondary copies do not attempt to
* repeat initialization or make changes to unused copies of mask values.
*/
// Supreme mask (cannot be overridden)
uint128_t *g_supreme_sm_mask = NULL;
// Global mask (applies across all threads)
uint64_t g_sm_mask = 0;
// Next mask (applies per-thread)
__thread uint64_t g_next_sm_mask = 0;
// Flag value to indicate if setup has been completed
bool sm_control_setup_called = false;
#ifdef LIBSMCTRL_STATIC
// Special handling for when this is built as a static library and the
// libcuda.so.1 libsmctrl wrapper is in use (see the comment on the setup()
// constructor for details).
static void (*shared_set_global_mask)(uint64_t) = NULL;
static void (*shared_set_next_mask)(uint64_t) = NULL;
#endif
// v1 has been removed---it intercepted the TMD/QMD too early, making it
// impossible to override the CUDA-injected stream mask with the next mask.
static void control_callback_v2(void *ukwn, int domain, int cbid, const void *in_params) {
// ***Only tested on platforms with 64-bit pointers.***
// The first 8-byte element in `in_params` appears to be its size. `in_params`
// must have at least five 8-byte elements for index four to be valid.
if (*(uint32_t*)in_params < 5 * sizeof(void*))
abort(1, 0, "Unsupported CUDA version for callback-based SM masking. Aborting...");
// The fourth 8-byte element in `in_params` is a pointer to the TMD. Note
// that this fourth pointer must exist---it only exists when the first
// 8-byte element of `in_params` is at least 0x28 (checked above).
void* tmd = *((void**)in_params + 4);
if (!tmd)
abort(1, 0, "TMD allocation appears NULL; likely forward-compatibilty issue.\n");
uint32_t *lower_ptr, *upper_ptr, *ext_lower_ptr, *ext_upper_ptr;
// The location of the TMD version field seems consistent across versions
uint8_t tmd_ver = *(uint8_t*)(tmd + 72);
if (tmd_ver >= 0x40) {
// TMD V04_00 is used starting with Hopper to support masking >64 TPCs
lower_ptr = tmd + 304;
upper_ptr = tmd + 308;
ext_lower_ptr = tmd + 312;
ext_upper_ptr = tmd + 316;
// XXX: Disable upper 64 TPCs until we have ...next_mask_ext and
// ...global_mask_ext
*ext_lower_ptr = -1;
*ext_upper_ptr = -1;
// An enable bit is also required
*(uint32_t*)tmd |= 0x80000000;
} else if (tmd_ver >= 0x16) {
// TMD V01_06 is used starting with Kepler V2, and is the first to
// support TPC masking
lower_ptr = tmd + 84;
upper_ptr = tmd + 88;
} else {
// TMD V00_06 is documented to not support SM masking
abort(1, 0, "TMD version %04o is too old! This GPU does not support SM masking.\n", tmd_ver);
}
// Setting the next mask overrides both per-stream and global masks
if (g_next_sm_mask) {
*lower_ptr = (uint32_t)g_next_sm_mask;
*upper_ptr = (uint32_t)(g_next_sm_mask >> 32);
g_next_sm_mask = 0;
} else if (!*lower_ptr && !*upper_ptr) {
// Only apply the global mask if a per-stream mask hasn't been set
*lower_ptr = (uint32_t)g_sm_mask;
*upper_ptr = (uint32_t)(g_sm_mask >> 32);
}
// No one may override the supreme SM mask; any SMs disabled in it (set
// bits) must always remain disabled.
if (g_supreme_sm_mask) {
*lower_ptr |= (uint32_t)*g_supreme_sm_mask;
*upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 32);
if (tmd_ver >= 0x40) {
*ext_lower_ptr |= (uint32_t)(*g_supreme_sm_mask >> 64);
*ext_upper_ptr |= (uint32_t)(*g_supreme_sm_mask >> 96);
}
}
//fprintf(stderr, "Final SM Mask (lower): %x\n", *lower_ptr);
//fprintf(stderr, "Final SM Mask (upper): %x\n", *upper_ptr);
}
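// Worked example of the precedence implemented above (illustrative values):
// with g_sm_mask = 0x3 (TPCs 0 and 1 disabled) and g_next_sm_mask = 0xc set
// by the launching thread, the next launch is uploaded with lower = 0xc and
// g_next_sm_mask is cleared. Later launches fall back to any per-stream mask
// already present in the TMD, or to g_sm_mask (0x3) if none is set. If
// *g_supreme_sm_mask = 0x1, bit 0 is OR'd into every uploaded mask, so TPC 0
// can never be re-enabled by the other mask types.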
static void setup_sm_control_callback() {
int (*subscribe)(uint32_t* hndl, void(*callback)(void*, int, int, const void*), void* ukwn);
int (*enable)(uint32_t enable, uint32_t hndl, int domain, int cbid);
uintptr_t* tbl_base;
uint32_t my_hndl;
// Avoid race conditions (setup should only run once)
if (__atomic_test_and_set(&sm_control_setup_called, __ATOMIC_SEQ_CST))
return;
#if CUDA_VERSION <= 6050
// Verify supported CUDA version
// It's impossible for us to run with a version of CUDA older than we were
// built by, so this check is excluded if built with CUDA > 6.5.
int ver = 0;
cuDriverGetVersion(&ver);
if (ver < 6050)
abort(1, ENOSYS, "Global or next masking requires at least CUDA 6.5; "
"this application is using CUDA %d.%d",
ver / 1000, (ver % 1000) / 10);
#endif
// Set up callback
cuGetExportTable((const void**)&tbl_base, &callback_funcs_id);
uintptr_t subscribe_func_addr = *(tbl_base + 3);
uintptr_t enable_func_addr = *(tbl_base + 6);
subscribe = (typeof(subscribe))subscribe_func_addr;
enable = (typeof(enable))enable_func_addr;
int res = 0;
res = subscribe(&my_hndl, control_callback_v2, NULL);
if (res)
abort(1, 0, "Error subscribing to launch callback. CUDA returned error code %d.", res);
res = enable(1, my_hndl, QMD_DOMAIN, QMD_PRE_UPLOAD);
if (res)
abort(1, 0, "Error enabling launch callback. CUDA returned error code %d.", res);
}
// Set default mask for all launches
void libsmctrl_set_global_mask(uint64_t mask) {
#ifdef LIBSMCTRL_STATIC
// Special handling for when this is built as a static library and the
// libcuda.so.1 libsmctrl wrapper is in use (see the comment on the setup()
// constructor for details).
if (shared_set_global_mask)
return (*shared_set_global_mask)(mask);
#endif
setup_sm_control_callback();
g_sm_mask = mask;
}
// Set mask for next launch from this thread
void libsmctrl_set_next_mask(uint64_t mask) {
#ifdef LIBSMCTRL_STATIC
// Special handling for when this is built as a static library and the
// libcuda.so.1 libsmctrl wrapper is in use (see the comment on the setup()
// constructor for details).
if (shared_set_next_mask)
return (*shared_set_next_mask)(mask);
#endif
setup_sm_control_callback();
g_next_sm_mask = mask;
}
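// Example use of the two setters above (a hedged sketch; the mask values are
// arbitrary and assume a GPU with at least 8 TPCs):
//   libsmctrl_set_global_mask(~0x0full); // every launch: TPCs 0-3 only
//   libsmctrl_set_next_mask(~0x30ull);   // next launch from this thread:
//                                        // TPCs 4-5 only, then revert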
/*** Per-Stream SM Mask (unlikely to be forward-compatible) ***/
// Offsets for the stream struct on x86_64
// No offset appears to work with CUDA 6.5 (tried 0x0--0x1b4 w/ 4-byte step)
// 6.5 tested on 340.118
#define CU_8_0_MASK_OFF 0xec
#define CU_9_0_MASK_OFF 0x130
// CUDA 9.0 and 9.1 use the same offset
// 9.1 tested on 390.157
#define CU_9_2_MASK_OFF 0x140
#define CU_10_0_MASK_OFF 0x244
// CUDA 10.0, 10.1 and 10.2 use the same offset
// 10.1 tested on 418.113
// 10.2 tested on 440.100, 440.82, 440.64, and 440.36
#define CU_11_0_MASK_OFF 0x274
#define CU_11_1_MASK_OFF 0x2c4
#define CU_11_2_MASK_OFF 0x37c
// CUDA 11.2, 11.3, 11.4, and 11.5 use the same offset
// 11.4 tested on 470.223.02
#define CU_11_6_MASK_OFF 0x38c
#define CU_11_7_MASK_OFF 0x3c4
#define CU_11_8_MASK_OFF 0x47c
// 11.8 tested on 520.56.06
#define CU_12_0_MASK_OFF 0x4cc
// CUDA 12.0 and 12.1 use the same offset
// 12.0 tested on 525.147.05
#define CU_12_2_MASK_OFF 0x4e4
// 12.2 tested on 535.129.03
#define CU_12_3_MASK_OFF 0x49c
// 12.3 tested on 545.29.06
#define CU_12_4_MASK_OFF 0x4ac
// 12.4 tested on 550.54.14 and 550.54.15
#define CU_12_5_MASK_OFF 0x4ec
// CUDA 12.5 and 12.6 use the same offset
// 12.5 tested on 555.58.02
// 12.6 tested on 560.35.03
#define CU_12_7_MASK_OFF 0x4fc
// CUDA 12.7 and 12.8 use the same offset
// 12.7 tested on 565.77
// 12.8 tested on 570.124.06
// Offsets for the stream struct on Jetson aarch64
#define CU_9_0_MASK_OFF_JETSON 0x128
// 9.0 tested on Jetpack 3.x (TX2, Nov 2023)
#define CU_10_2_MASK_OFF_JETSON 0x24c
// 10.2 tested on Jetpack 4.x (AGX Xavier and TX2, Nov 2023)
#define CU_11_4_MASK_OFF_JETSON 0x394
// 11.4 tested on Jetpack 5.x (AGX Orin, Nov 2023)
// TODO: 11.8, 12.0, 12.1, and 12.2 on Jetpack 5.x via compatibility packages
#define CU_12_2_MASK_OFF_JETSON 0x50c
// 12.2 tested on Jetpack 6.x (AGX Orin, Dec 2024)
#define CU_12_4_MASK_OFF_JETSON 0x4c4
// 12.4 tested on Jetpack 6.x with cuda-compat-12-4 (AGX Orin, Dec 2024)
#define CU_12_5_MASK_OFF_JETSON 0x50c
// 12.5 tested on Jetpack 6.x with cuda-compat-12-5 (AGX Orin, Dec 2024)
#define CU_12_6_MASK_OFF_JETSON 0x514
// 12.6 tested on Jetpack 6.x with cuda-compat-12-6 (AGX Orin, Dec 2024)
// Used up through CUDA 11.8 in the stream struct
struct stream_sm_mask {
uint32_t upper;
uint32_t lower;
};
// Used starting with CUDA 12.0 in the stream struct
struct stream_sm_mask_v2 {
uint32_t enabled;
uint32_t mask[4];
};
// Check if this system has a Parker SoC (TX2/PX2 chip)
// (CUDA 9.0 behaves slightly differently on this platform.)
// @return 1 if detected, 0 if not, -cuda_err on error
#if __aarch64__
static int detect_parker_soc() {
int cap_major, cap_minor, err, dev_count;
if (err = cuDeviceGetCount(&dev_count))
return -err;
// As CUDA devices are numbered by order of compute power, check every
// device, in case a powerful discrete GPU is attached (such as on the
// DRIVE PX2). We detect the Parker SoC via its unique CUDA compute
// capability: 6.2.
for (int i = 0; i < dev_count; i++) {
if (err = cuDeviceGetAttribute(&cap_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
i))
return -err;
if (err = cuDeviceGetAttribute(&cap_major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
i))
return -err;
if (cap_major == 6 && cap_minor == 2)
return 1;
}
return 0;
}
#endif // __aarch64__
// Should work for CUDA 8.0 through 12.8
// A cudaStream_t is a CUstream*. We use void* to avoid a cuda.h dependency in
// our header
void libsmctrl_set_stream_mask(void* stream, uint64_t mask) {
// When the old API is used on GPUs with over 64 TPCs, disable all TPCs beyond the first 64
uint128_t full_mask = -1;
full_mask <<= 64;
full_mask |= mask;
libsmctrl_set_stream_mask_ext(stream, full_mask);
}
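// Example (hedged sketch): partitioning two streams onto disjoint TPC sets
// with the 64-bit API above. `stream_a` and `stream_b` are hypothetical
// cudaStream_t handles created by the caller; a set bit disables that TPC
// for the stream.
//   libsmctrl_set_stream_mask(stream_a, ~0x0full); // TPCs 0-3 only
//   libsmctrl_set_stream_mask(stream_b, ~0xf0ull); // TPCs 4-7 only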
void libsmctrl_set_stream_mask_ext(void* stream, uint128_t mask) {
char* stream_struct_base = *(char**)stream;
struct stream_sm_mask* hw_mask = NULL;
struct stream_sm_mask_v2* hw_mask_v2 = NULL;
int ver;
cuDriverGetVersion(&ver);
switch (ver) {
#if __x86_64__
case 8000:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_8_0_MASK_OFF);
break;
case 9000:
case 9010: {
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF);
break;
}
case 9020:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_2_MASK_OFF);
break;
case 10000:
case 10010:
case 10020:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_0_MASK_OFF);
break;
case 11000:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_0_MASK_OFF);
break;
case 11010:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_1_MASK_OFF);
break;
case 11020:
case 11030:
case 11040:
case 11050:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_2_MASK_OFF);
break;
case 11060:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_6_MASK_OFF);
break;
case 11070:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_7_MASK_OFF);
break;
case 11080:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_8_MASK_OFF);
break;
case 12000:
case 12010:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_0_MASK_OFF);
break;
case 12020:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF);
break;
case 12030:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_3_MASK_OFF);
break;
case 12040:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF);
break;
case 12050:
case 12060:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF);
break;
case 12070:
case 12080:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_7_MASK_OFF);
break;
#elif __aarch64__
case 9000: {
// Jetson TX2 offset is slightly different on CUDA 9.0.
// Only compile the check into ARM64 builds.
// TODO: Always verify Jetson-board-only on aarch64.
int is_parker;
const char* err_str;
if ((is_parker = detect_parker_soc()) < 0) {
cuGetErrorName(-is_parker, &err_str);
abort(1, 0, "While performing platform-specific "
"compatibility checks for stream masking, "
"CUDA call failed with error '%s'.", err_str);
}
if (!is_parker)
abort(1, 0, "Not supported on non-Jetson aarch64.");
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_9_0_MASK_OFF_JETSON);
break;
}
case 10020:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_10_2_MASK_OFF_JETSON);
break;
case 11040:
hw_mask = (struct stream_sm_mask*)(stream_struct_base + CU_11_4_MASK_OFF_JETSON);
break;
case 12020:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF_JETSON);
break;
case 12040:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_4_MASK_OFF_JETSON);
break;
case 12050:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_5_MASK_OFF_JETSON);
break;
case 12060:
hw_mask_v2 = (void*)(stream_struct_base + CU_12_6_MASK_OFF_JETSON);
break;
#endif
}
// For experimenting to determine the right mask offset, set the MASK_OFF
// environment variable (positive and negative numbers are supported)
char* mask_off_str = getenv("MASK_OFF");
if (mask_off_str) {
int off = atoi(mask_off_str);
fprintf(stderr, "libsmctrl: Attempting offset %d on CUDA 12.2 base %#x "
"(total off: %#x)\n", off, CU_12_2_MASK_OFF, CU_12_2_MASK_OFF + off);
if (CU_12_2_MASK_OFF + off < 0)
abort(1, 0, "Total offset cannot be less than 0! Aborting...");
// Add 4 bytes to an offset found this way to reuse it with the pre-12.0
// hw_mask layout (hw_mask_v2 begins with a 4-byte enable word)
hw_mask_v2 = (void*)(stream_struct_base + CU_12_2_MASK_OFF + off);
}
// Mask layout changed with CUDA 12.0 to support large Hopper/Ada GPUs
if (hw_mask) {
hw_mask->upper = mask >> 32;
hw_mask->lower = mask;
} else if (hw_mask_v2) {
hw_mask_v2->enabled = 1;
hw_mask_v2->mask[0] = mask;
hw_mask_v2->mask[1] = mask >> 32;
hw_mask_v2->mask[2] = mask >> 64;
hw_mask_v2->mask[3] = mask >> 96;
} else {
abort(1, 0, "Stream masking unsupported on this CUDA version (%d), and"
" no fallback MASK_OFF set!", ver);
}
}
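// Example (hedged sketch) for GPUs with more than 64 TPCs: build a 128-bit
// disable mask that leaves only TPCs 64-71 enabled. The shifts mirror how
// libsmctrl_set_stream_mask() widens its 64-bit argument above; `some_stream`
// is a hypothetical caller-created stream.
//   uint128_t mask = -1;      // start with every TPC disabled
//   uint128_t enable = 0xff;  // TPCs 64-71
//   mask &= ~(enable << 64);  // re-enable (clear) those eight bits
//   libsmctrl_set_stream_mask_ext(some_stream, mask);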
/*** TPC and GPU Informational Functions ***/
// Read an integer from a file in `/proc`
static int read_int_procfile(char* filename, uint64_t* out) {
char f_data[18] = {0};
ssize_t ret;
int fd = open(filename, O_RDONLY);
if (fd == -1)
return errno;
// Leave room for a terminating NUL so strtoll() cannot overread
ret = read(fd, f_data, sizeof(f_data) - 1);
if (ret == -1) {
int read_err = errno;
close(fd);
return read_err;
}
close(fd);
*out = strtoll(f_data, NULL, 16);
return 0;
}
// We support up to 128 TPCs, up to 12 GPCs per GPU, and up to 16 GPUs.
#define MAX_GPCS 12
static uint64_t tpc_mask_per_gpc_per_dev[16][MAX_GPCS];
static uint128_t tpc_mask_per_gpc_per_dev_ext[16][MAX_GPCS];
// Output mask is vtpc-indexed (virtual TPC)
// Note that this function has to undo _both_ floorsweeping and ID remapping
int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev) {
int err, i;
uint128_t *tpcs_for_gpc_ext;
if ((err = libsmctrl_get_gpc_info_ext(num_enabled_gpcs, &tpcs_for_gpc_ext, dev)))
return err;
for (i = 0; i < *num_enabled_gpcs; i++) {
if ((tpcs_for_gpc_ext[i] & -1ull) != tpcs_for_gpc_ext[i])
return ERANGE;
tpc_mask_per_gpc_per_dev[dev][i] = (uint64_t)tpcs_for_gpc_ext[i];
}
*tpcs_for_gpc = tpc_mask_per_gpc_per_dev[dev];
return 0;
}
int libsmctrl_get_gpc_info_ext(uint32_t* num_enabled_gpcs, uint128_t** tpcs_for_gpc, int dev) {
uint32_t i, j, tpc_id, gpc_id, num_enabled_tpcs, num_configured_tpcs;
uint64_t gpc_mask, num_tpc_per_gpc, max_gpcs, gpc_tpc_mask, gpc_tpc_config, total_read = 0;
uint128_t tpc_bit;
int err;
char filename[100];
*num_enabled_gpcs = 0;
// Maximum number of GPCs supported for this chip
snprintf(filename, 100, "/proc/gpu%d/num_gpcs", dev);
if (err = read_int_procfile(filename, &max_gpcs)) {
fprintf(stderr, "libsmctrl: nvdebug module must be loaded into kernel before "
"using libsmctrl_get_*_info() functions\n");
return err;
}
// TODO: handle arbitrary-size GPUs
if (dev >= 16 || max_gpcs > MAX_GPCS) {
fprintf(stderr, "libsmctrl: GPU possibly too large for preallocated map!\n");
return ERANGE;
}
// Set bit = disabled GPC
snprintf(filename, 100, "/proc/gpu%d/gpc_mask", dev);
if (err = read_int_procfile(filename, &gpc_mask))
return err;
// Determine the number of enabled TPCs
snprintf(filename, 100, "/proc/gpu%d/num_tpc_per_gpc", dev);
if (err = read_int_procfile(filename, &num_tpc_per_gpc))
return err;
// For each enabled GPC
num_enabled_tpcs = 0;
for (i = 0; i < max_gpcs; i++) {
// Skip this GPC if disabled
if ((1 << i) & gpc_mask)
continue;
(*num_enabled_gpcs)++;
// Get the bitstring of TPCs disabled for this physical GPC
// Set bit = disabled TPC
snprintf(filename, 100, "/proc/gpu%d/gpc%d_tpc_mask", dev, i);
if (err = read_int_procfile(filename, &gpc_tpc_mask))
return err;
// Bits greater than the max number of TPCs should be ignored, so only
// keep the `num_tpc_per_gpc`-count number of lower bits.
gpc_tpc_mask &= -1ull >> (64 - num_tpc_per_gpc);
// Number of enabled TPCs = max - number disabled
num_enabled_tpcs += num_tpc_per_gpc - __builtin_popcountl(gpc_tpc_mask);
}
// Clear any previous mask
for (i = 0; i < MAX_GPCS; i++)
tpc_mask_per_gpc_per_dev_ext[dev][i] = 0;
// For each enabled TPC
for (tpc_id = 0; tpc_id < num_enabled_tpcs;) {
// Pull mapping for the next set of 4 TPCs
snprintf(filename, 100, "/proc/gpu%d/CWD_GPC_TPC_ID%d", dev, tpc_id / 4);
if (err = read_int_procfile(filename, &gpc_tpc_config))
return err;
total_read += gpc_tpc_config;
for (j = 0; j < 4 && tpc_id < num_enabled_tpcs; j++, tpc_id++) {
// Set the bit for the current TPC
tpc_bit = 1;
tpc_bit <<= tpc_id;
// Determine which GPC the current TPC is associated with
// (upper 4 bits of each byte)
gpc_id = (gpc_tpc_config >> (j*8 + 4) & 0xfu);
// Save mapping
tpc_mask_per_gpc_per_dev_ext[dev][gpc_id] |= tpc_bit;
}
}
// Verify each TPC is configured
tpc_bit = 0;
for (i = 0; i < MAX_GPCS; i++)
tpc_bit |= tpc_mask_per_gpc_per_dev_ext[dev][i];
num_configured_tpcs = __builtin_popcountl(tpc_bit) + __builtin_popcountl(tpc_bit >> 64);
if (num_configured_tpcs != num_enabled_tpcs) {
fprintf(stderr, "libsmctrl: Found configuration for only %d TPCs when %d were expected.\n", num_configured_tpcs, num_enabled_tpcs);
return EIO;
}
// Verify that the configuration was not all zeroes (which indicates a
// powered-off GPU).
if (total_read == 0) {
fprintf(stderr, "libsmctrl: Is GPU on? Configuration registers are all zero.\n");
return EIO;
}
*tpcs_for_gpc = tpc_mask_per_gpc_per_dev_ext[dev];
return 0;
}
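// Example (hedged sketch): using the mapping above to confine a stream to the
// TPCs of GPC 0. In the returned array a set bit means "this virtual TPC
// belongs to that GPC", so its complement is the disable mask for everything
// outside the GPC. `my_stream` is a hypothetical caller-created stream.
//   uint32_t num_gpcs;
//   uint128_t *tpcs_for_gpc;
//   if (!libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_for_gpc, 0))
//       libsmctrl_set_stream_mask_ext(my_stream, ~tpcs_for_gpc[0]);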
int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev) {
uint32_t num_gpcs;
uint128_t* tpcs_per_gpc;
int res, gpc;
if (res = libsmctrl_get_gpc_info_ext(&num_gpcs, &tpcs_per_gpc, dev))
return res;
*num_tpcs = 0;
for (gpc = 0; gpc < num_gpcs; gpc++) {
*num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc]);
*num_tpcs += __builtin_popcountl(tpcs_per_gpc[gpc] >> 64);
}
return 0;
}
// @param dev Device index as understood by CUDA **can differ from nvdebug idx**
// This implementation is fragile, and could be incorrect for odd GPUs
int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev) {
int num_sms, sms_per_tpc, major, minor, res = 0;
const char* err_str;
if (res = cuInit(0))
goto abort_cuda;
if (res = cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_dev))
goto abort_cuda;
if (res = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_dev))
goto abort_cuda;
if (res = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_dev))
goto abort_cuda;
// SM masking only works on sm_35+
if (major < 3 || (major == 3 && minor < 5))
return ENOTSUP;
// Everything newer than Pascal (as of Hopper) has 2 SMs per TPC, as well
// as the P100, which is uniquely sm_60
if (major > 6 || (major == 6 && minor == 0))
sms_per_tpc = 2;
else
sms_per_tpc = 1;
// It looks like there may be some upcoming weirdness (TPCs with only one SM?)
// with Hopper
if (major >= 9)
fprintf(stderr, "libsmctrl: WARNING, TPC masking is untested on Hopper,"
" and will likely yield incorrect results! Proceed with caution.\n");
*num_tpcs = num_sms/sms_per_tpc;
return 0;
abort_cuda:
cuGetErrorName(res, &err_str);
fprintf(stderr, "libsmctrl: CUDA call failed due to %s. Failing with EIO...\n", err_str);
return EIO;
}
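// Example (hedged sketch): splitting a GPU roughly in half using the count
// obtained above. Assumes device 0 and at most 64 TPCs; `stream_a` and
// `stream_b` are hypothetical caller-created streams, and a set bit disables
// a TPC.
//   uint32_t num_tpcs;
//   if (!libsmctrl_get_tpc_info_cuda(&num_tpcs, 0)) {
//       uint64_t lower_half = (1ull << (num_tpcs / 2)) - 1;
//       libsmctrl_set_stream_mask(stream_a, ~lower_half); // lower half only
//       libsmctrl_set_stream_mask(stream_b, lower_half);  // upper half only
//   }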
/*** Private functions for nvtaskset and building as a libcuda.so.1 wrapper ***/
// Check if NVIDIA MPS is running, following the process that `strace` shows
// `nvidia-cuda-mps-control` to use. MPS is a prerequisite to co-running
// multiple GPU-using tasks without timeslicing.
bool libsmctrl_is_mps_running() {
char *mps_pipe_dir;
int mps_ctrl;
struct sockaddr_un mps_ctrl_addr;
mps_ctrl_addr.sun_family = AF_UNIX;
const int yes = 1;
if (!(mps_pipe_dir = getenv("CUDA_MPS_PIPE_DIRECTORY")))
mps_pipe_dir = "/tmp/nvidia-mps";
// Pipe names are limited to 108 characters (the size of sun_path)
snprintf(mps_ctrl_addr.sun_path, 108, "%s/control", mps_pipe_dir);
// This mirrors the process `nvidia-cuda-mps-control` uses to detect MPS
if ((mps_ctrl = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
return false;
if (setsockopt(mps_ctrl, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(yes)) == -1) {
close(mps_ctrl);
return false;
}
if (connect(mps_ctrl, (struct sockaddr*)&mps_ctrl_addr, sizeof(struct sockaddr_un)) == -1) {
close(mps_ctrl);
return false;
}
close(mps_ctrl);
return true;
}
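// Example (hedged sketch): warn when partitioned tasks would be time-sliced
// rather than co-run because MPS is absent.
//   if (!libsmctrl_is_mps_running())
//       fprintf(stderr, "Masks set, but tasks will time-slice without MPS\n");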
// A variant of strtoul with support for 128-bit integers
uint128_t strtou128(const char *nptr, char **endptr, int base) {
unsigned __int128 result = 0;
if (base != 16)
error(1, EINVAL, "strtou128 only supports base 16");
// Skip a "0x" prefix. Safe due to early evaluation
if (*nptr == '0' && (*(nptr + 1) == 'x' || *(nptr + 1) == 'X'))
nptr += 2;
// Until hitting an invalid character
while (1) {
if (*nptr >= 'a' && *nptr <= 'f')
result = result << 4 | (*nptr - 'a' + 10);
else if (*nptr >= 'A' && *nptr <= 'F')
result = result << 4 | (*nptr - 'A' + 10);
else if (*nptr >= '0' && *nptr <= '9')
result = result << 4 | (*nptr - '0');
else
break;
nptr++;
}
if (endptr)
*endptr = (char*)nptr;
return result;
}
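// Example use of strtou128 (a hedged sketch; the literal is arbitrary): parse
// a 128-bit hexadecimal disable mask, e.g. one taken from an environment
// variable.
//   char *end;
//   uint128_t m = strtou128("0xffffffffffffffff00000000000000f0", &end, 16);
//   // *end == '\0' indicates the whole string was parsed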
#ifdef LIBSMCTRL_WRAPPER
// The CUDA runtime library uses dlopen() to load CUDA functions from
// libcuda.so.1. Since we replace that with our wrapper library, we need to
// also redirect any attempted opens of that shared object to the actual
// shared library, which is linked to by libcuda.so.
void *dlopen(const char *filename, int flags) {
if (filename && strcmp(filename, "libcuda.so.1") == 0) {
fprintf(stderr, "redirecting dlopen of %s to libcuda.so\n", filename);
// A GNU-only dlopen variant
return dlmopen(LM_ID_BASE, "libcuda.so", flags);
} else
return dlmopen(LM_ID_BASE, filename, flags);
}
// Allow setting a default mask via an environment variable
// Also enables libsmctrl to be used on unmodified programs via setting:
// LD_LIBRARY_PATH=libsmctrl LIBSMCTRL_MASK=<your mask> ./my_program
// Where "<your mask>" is replaced with a disable mask, optionally prefixed
// with a ~ to invert it (make it an enable mask).
__attribute__((constructor)) static void setup(void) {
char *end, *mask_str;
// If dynamic changes are disabled (due to an error) this variable is
// permanently used to store the supreme mask, rather than the shared
// memory segment.
static uint128_t mask;
bool invert = false;
int fd;
mask_str = getenv("LIBSMCTRL_MASK");
// Assume no mask if unspecified
if (!mask_str)
mask_str = "0";
if (*mask_str == '~') {
invert = true;
mask_str++;
}
mask = strtou128(mask_str, &end, 16);
// Verify we were able to parse the whole string
if (*end != '\0')
abort(1, EINVAL, "Unable to apply default mask");
if (invert)
mask = ~mask;
// Explicitly set the number of channels (if unset), otherwise CUDA will only
// use two with MPS (see paper for why that causes problems)
if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "8", 0) == -1)
abort(1, EINVAL, "Unable to configure environment");
// Warn if a mask was specified but MPS isn't running
if (mask && !libsmctrl_is_mps_running())
fprintf(stderr, "libsmctrl-libcuda-wrapper: Warning: TPC mask set via LIBSMCTRL_MASK, but NVIDIA MPS is not running. CUDA programs will not co-run!\n");
// Initialize CUDA and the interception callback
setup_sm_control_callback();
// Create shared memory region for the supreme mask such that nvtaskset
// can read and modify it
fd = memfd_create("libsmctrl", MFD_CLOEXEC);
if (fd == -1) {
abort(0, errno, "Unable to create shared memory for dynamic partition changes. Dynamic changes disabled");
g_supreme_sm_mask = &mask;
return;
}
if (ftruncate(fd, 16) == -1) {
abort(0, errno, "Unable to resize shared memory for dynamic partition changes. Dynamic changes disabled");
g_supreme_sm_mask = &mask;
return;
}
if ((g_supreme_sm_mask = mmap(NULL, 16, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) {
abort(0, errno, "Unable to map shared memory for dynamic partition changes. Dynamic changes disabled");
g_supreme_sm_mask = &mask;
return;
}
// Set the super-global mask which cannot be overwritten by any libsmctrl
// API function.
*g_supreme_sm_mask = mask;
}
#elif defined(LIBSMCTRL_STATIC)
// If this library is statically built into a program, and the libcuda.so.1
// wrapper is enabled, we force the statically linked version of the library
// to defer to the function implementations in the wrapper.
//
// Longer explanation:
// If the library has been dynamically linked into a program and the wrapper
// is in use, the loader will point both at the same set of symbols: both do a
// dynamic lookup at load time, and the global state at the top of this file
// has global linkage, so it appears in the dynamic symbol table and each
// lookup finds the same copy.
// Symbols from a statically linked library are not included in the dynamic
// symbol table, and can thus exist in duplicate alongside those in any shared
// library. This is a problem, since only one callback function, using one set
// of global variables, can be registered with CUDA. We work around this by
// having our statically linked library use the functions from the wrapper or
// any shared copy of libsmctrl, if one such instance is loaded.
__attribute__((constructor)) static void setup(void) {
// dlsym can only search the dynamic symbol tables, so these lookups will
// fail if neither the wrapper (libcuda.so.1) nor libsmctrl.so is loaded.
// (That indicates that we should use the static library implementations.)
// These are a NOP on failure, since dlsym returns NULL when a symbol is not
// found.
shared_set_next_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_next_mask");
shared_set_global_mask = dlsym(RTLD_DEFAULT, "libsmctrl_set_global_mask");
}
#endif