/**
 * Copyright 2023 Joshua Bakita
 * Library to control TPC masks on CUDA launches. Co-opts preexisting debug
 * logic in the CUDA driver library, and thus requires a build with -lcuda.
 */

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>

/* PARTITIONING FUNCTIONS */

// Set global default TPC mask for all kernels, incl. CUDA-internal ones
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1
extern void libsmctrl_set_global_mask(uint64_t mask);
// Set default TPC mask for all kernels launched via `stream`
// (overrides global mask)
// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 8.0 - CUDA 12.1
extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
// Set TPC mask for the next kernel launch from the caller's CPU thread
// (overrides global and per-stream masks, applies only to next launch).
// @param mask   A bitmask of enabled/disabled TPCs (see Notes on Bitmasks)
// Supported: CUDA 11.0 - CUDA 12.1
extern void libsmctrl_set_next_mask(uint64_t mask);

/**
 * Notes on Bitmasks
 *
 * All of the core partitioning functions take a `uint64_t mask` parameter. A
 * set bit in the mask indicates that the respective Thread Processing Cluster
 * (TPC) is to be __disabled__.
 *
 * Examples
 * To prohibit the next kernel from using TPC 0:
 *     libsmctrl_set_next_mask(0x1);
 * Allow kernels to only use TPC 0 by default:
 *     libsmctrl_set_global_mask(~0x1ull);
 * Allow kernels in a stream to only use TPCs 2, 3, and 4:
 *     libsmctrl_set_stream_mask(stream, ~0b00111100ull);
 *
 * Note that the bitwise inversion operator (~, as used above) is very useful,
 * just be sure to apply it to 64-bit integer literals only! (~0x1 != ~0x1ull)
 */

/* INFORMATIONAL FUNCTIONS */

// Get number of GPCs for devices number `dev`, and a GPC-indexed array
// containing masks of which TPCs are associated with each GPC.
// Note that the `nvdebug` module must be loaded to use this function.
// @param  num_enabled_gpcs (out) Location to store number of GPCs in
// @param  tpcs_for_gpc     (out) Pointer to store pointer to output buffer at
// @param  dev               (in) `nvdebug` device ID
// @return 0 on success, `errno`-compatible error code on failure
extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
// Get total number of TPCs on device number `dev`. Requires `nvdebug`.
// @param  num_tpcs        (out) Location to store number of TPCs at
// @param  dev              (in) `nvdebug` device ID
// @return 0 on success, `errno`-compatible error code on failure
extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
// Identical to above, but for a CUDA device ID. Does not require `nvdebug`.
extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev);

#ifdef __cplusplus
}
#endif