path: root/nvdebug_entry.c



/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

#include <linux/device.h>  // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>  // For PCI device scanning
#include <linux/proc_fs.h>  // So we can set up entries in /proc

#include "nvdebug.h"
#include "stubs.h"

// Enable to intercept and log GPU interrupts
#define INTERRUPT_DEBUG 0

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;

// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;
struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;

// Starting in Kernel 5.6, proc_ops is required instead of file_operations
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
// This rewrites the struct to the proc_ops layout on newer kernels
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	return (struct proc_ops*)ops;
}
#else
const struct file_operations *compat_ops(const struct file_operations *ops)
{
	// Kernels before 5.6 take file_operations directly: nothing to convert
	return ops;
}
#endif

#if INTERRUPT_DEBUG
// Interrupt tap: log that the IRQ fired, then report it as not handled by us
// so that it is passed on. `dev` is unused.
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	// We never service the interrupt ourselves
	const irqreturn_t not_handled = IRQ_NONE;

	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	return not_handled;
}
#endif // INTERRUPT_DEBUG

// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_device(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	// This query pattern is mirrored off nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR,  // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16,  // Match display devs
	};
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1, gm20b - TX1)
	if (dev) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = get_gk20a(dev);
		g_nvdebug_state[i].regs = gk20a_regs(g_nvdebug_state[i].g);
		if (!g_nvdebug_state[i].regs)
			return -EADDRNOTAVAIL;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1)
			return -EADDRNOTAVAIL;
		g_nvdebug_state[i].chip_id = ids.chip_id;
		g_nvdebug_state[i].pcid = NULL;
		g_nvdebug_state[i].bar3 = NULL;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	// FIXME: State rollback
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		g_nvdebug_state[i].pcid = pcid;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
#endif // INTERRUPT_DEBUG
		i++;
	}
	// Return the number of devices we found
	if (i > 0)
		return i;
	return -ENODEV;
}

// Create files `/proc/gpu#/runlist#`, world readable
int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
	ptop_device_info_gk104_t info;
	struct proc_dir_entry *rl_entry;
	int i, rl_id;
	char runlist_name[12];
	int max_rl_id = 0; // Always at least one runlist
	// Figure out how many runlists there are by checking the device info
	// registers. Runlists are always numbered sequentially, so we just have
	// to find the highest-valued one and add 1 to get the number of runlists.
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
		info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i));
		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
			continue;
		if (info.runlist_enum > max_rl_id)
			max_rl_id = info.runlist_enum;
	}
	// Create files to read each runlist. The read handling code looks at the
	// PDE_DATA associated with the file to determine what the runlist ID is.
	for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
		snprintf(runlist_name, 12, "runlist%d", rl_id);
		rl_entry = proc_create_data(
			runlist_name, 0444, dir, compat_ops(&runlist_file_ops),
			(void*)(uintptr_t)rl_id);
		if (!rl_entry)
			return -ENOMEM;
	}
	return 0;
}

// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable, in `dir` for each
// enabled GPC of GPU `device_id`. Each file exposes the register holding the
// mask of disabled TPCs for that GPC.
// Returns 0 on success, or -ENOMEM if an entry could not be created.
// TODO: Don't run this on unsupported GPUs
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	char file_name[20];
	unsigned int i;
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC);
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
	// The disable mask only has one bit per GPC, so there is nothing to test
	// beyond its width. This also keeps a bogus register value (such as the
	// all-ones of a failed read) from causing out-of-range shifts and a
	// near-endless loop.
	if (max_gpcs > 8 * sizeof(gpcs_mask))
		max_gpcs = 8 * sizeof(gpcs_mask);
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1u << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, sizeof(file_name), "gpc%u_tpc_mask", i);
		if (!proc_create_data(
				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i)))
			return -ENOMEM;
	}
	return 0;
}

int __init nvdebug_init(void) {
	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
			      *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
			      *num_gpcs_entry, *lce_for_pce_entry, *grce_for_pce_entry;
	int rl_create_err, tpc_masks_create_err;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	int res = probe_and_cache_device();
	if (res < 0)
		return res;
	g_nvdebug_devices = res;
	// Create seperate ProcFS directories for each gpu
	while (res--) {
		char device_id_str[7];
		uintptr_t device_id = res;  // This is uintptr as we abuse the *data field on proc_dir_entry to store the GPU id
		// Create directory /proc/gpu# where # is the GPU number
		snprintf(device_id_str, 7, "gpu%ld", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files `/proc/gpu#/runlist#`, world readable
		if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE)
			create_runlist_files(device_id, dir);
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
		// Create file `/proc/gpu#/preempt_tsg`, world writable
		preempt_entry = proc_create_data(
			"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/disable_channel`, world writable
		disable_channel_entry = proc_create_data(
			"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/enable_channel`, world writable
		enable_channel_entry = proc_create_data(
			"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/switch_to_tsg`, world writable
		switch_to_tsg_entry = proc_create_data(
			"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/device_info`, world readable
		device_info_entry = proc_create_data(
			"device_info", 0444, dir, compat_ops(&device_info_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/num_gpcs`, world readable
		num_gpcs_entry = proc_create_data(
			"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_GPCS);
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		num_gpcs_entry = proc_create_data(
			"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
		// Create file `/proc/gpu#/num_ces`, world readable
		num_gpcs_entry = proc_create_data(
			"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_CES);
		// Create file `/proc/gpu#/num_ces`, world readable
		num_gpcs_entry = proc_create_data(
			"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_FUSE_GPC);
		// In both nouveau and nvgpu, the PCE_MAP register is  available on Pascal+
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){
			// Used for reading a subset of a register on pascal
			union reg_range pascal_reg;
			// Create a pce mask for iteration
			u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP);
			char file_name[21];
			int pce_id = 0;
			int pce_num = 0;
			int i;
			for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) {
				// If pce is enabled, create files and iterate pce_id; otherwise, do nothing
				if ((1 << pce_id) & ce_pce_map) {
					snprintf(file_name, 20, "lce_for_pce%d", pce_num);
					// Depending on GPU architecture, fetch data for the LCE of particular PCE
					switch (g_nvdebug_state[res].chip_id & 0xff0) {
						case NV_CHIP_ID_PASCAL:
							// On Pascal, two PCE configurations are packed per-byte.
							// Work around this by leveraging that we only run on 64-bit
							// platforms (can assume that a void* is 64-bits), and that
							// GPU register offsets are only 32-bits. Use the other 32
							// bits to store which bits to print.
							pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0);
							pascal_reg.start_bit = pce_id * 4;
							pascal_reg.stop_bit = pce_id * 4 + 4;
							lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
											(void*)pascal_reg.raw);
							break;
						case NV_CHIP_ID_VOLTA:
						case NV_CHIP_ID_VOLTA_INTEGRATED:
						case NV_CHIP_ID_TURING:
							lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
											(void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id));
							break;
						case NV_CHIP_ID_AMPERE:
						case NV_CHIP_ID_HOPPER:
						case NV_CHIP_ID_ADA:
							 lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
											(void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id));
							break;
					}
					if (!lce_for_pce_entry)
						return -ENOMEM;
					pce_num++;
				}
			}
			// We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1)
			for (i = 0; i < 2; i++) {
				union reg_range grce_reg = {0};
				snprintf(file_name, 21, "shared_lce_for_grce%d", i);
				// The offset used here is only documented for Turing
				// Actually, Pascal through Turing
				// On Pascal, it's only 3 bits, every 8 bits
				// On Volta-Turing, it start at same offset, but it's lower 4 bits, every 32 bits
				// On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing
				switch (g_nvdebug_state[res].chip_id & 0xff0) {
					case NV_CHIP_ID_PASCAL:
						grce_reg.offset = NV_GRCE_FOR_CE_GP100(0);
						grce_reg.start_bit = i * 8;
						grce_reg.stop_bit = grce_reg.start_bit + 3;
						break;
					case NV_CHIP_ID_VOLTA:
					case NV_CHIP_ID_VOLTA_INTEGRATED:
					case NV_CHIP_ID_TURING:
						grce_reg.offset = NV_GRCE_FOR_CE_GP100(i);
						grce_reg.start_bit = 0;
						grce_reg.stop_bit = grce_reg.start_bit + 4;
						break;
					case NV_CHIP_ID_AMPERE:
					case NV_CHIP_ID_HOPPER:
					case NV_CHIP_ID_ADA:
						grce_reg.offset = NV_GRCE_FOR_CE_GA100(i);
						grce_reg.start_bit = 0;
						grce_reg.stop_bit = grce_reg.start_bit + 4;
						break;
				}
				grce_for_pce_entry = proc_create_data(
								file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
								(void*)grce_reg.raw);
				if (!grce_for_pce_entry)
					return -ENOMEM;
			}

			// TODO: Redo to num_pces
			// Create file `/proc/gpu#/pce_map`, world readable
			num_gpcs_entry = proc_create_data(
				"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_CE_PCE_MAP);
		}
		// ProcFS entry creation only fails if out of memory
		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
		    !disable_channel_entry || !enable_channel_entry ||
		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
			goto out_nomem;
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;
out_nomem:
	// Make sure to clear all ProcFS directories on error
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return -ENOMEM;
}

// Module exit point: for each GPU (highest-numbered first), remove its
// `/proc/gpu#` directory and, for PCI GPUs, release the BAR mappings (and
// the IRQ tap, if INTERRUPT_DEBUG is enabled).
static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	unsigned int i;
	// Deinitialize each device
	for (i = g_nvdebug_devices; i-- > 0; ) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, sizeof(device_id), "gpu%u", i);
		remove_proc_subtree(device_id, NULL);
		// Free BAR mappings for PCIe devices
		g = &g_nvdebug_state[i];
		if (g->pcid) {
			if (g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g->bar2)
				pci_iounmap(g->pcid, g->bar2);
			// BAR3 is mapped during device probing, so it must be unmapped too
			if (g->bar3)
				pci_iounmap(g->pcid, g->bar3);
#if INTERRUPT_DEBUG
			free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.\n", g->chip_id);
	}
	// Leave the count at zero, rather than letting it wrap around
	g_nvdebug_devices = 0;
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);