/* Copyright 2024 Joshua Bakita
* SPDX-License-Identifier: MIT
*/
#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h> // For PCI device scanning
#include <linux/platform_device.h> // For platform_device struct
#include <linux/proc_fs.h> // So we can set up entries in /proc
#include "nvdebug_linux.h"
#include "stubs.h"
// Enable to intercept and log GPU interrupts. Historically used to benchmark
// interrupt latency.
#define INTERRUPT_DEBUG 0
// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
// runlist_procfs.c
extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations resubmit_runlist_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
// device_info_procfs.c
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;
extern struct file_operations local_memory_file_ops;
// copy_topology_procfs.c
extern struct file_operations copy_topology_file_ops;
struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;
// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;
// Starting in Kernel 5.6, proc_ops is required instead of file_operations.
// As file_operations is larger than proc_ops, we can overwrite the memory
// backing the file_operations struct to follow the proc_ops layout, and then
// cast on newer kernels.
// We use the last byte of the file_operations struct to flag that the memory
// layout has been rearranged.
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
const struct proc_ops* compat_ops(const struct file_operations* ops) {
struct proc_ops new_ops = {};
// Don't re-layout if it's already been done
if (*((uint8_t*)(ops + 1) - 1))
return (struct proc_ops*)ops;
new_ops.proc_open = ops->open;
new_ops.proc_read = ops->read;
new_ops.proc_write = ops->write;
new_ops.proc_lseek = ops->llseek;
new_ops.proc_release = ops->release;
memcpy((void*)ops, &new_ops, sizeof(new_ops));
// Flag re-layout as complete in last byte of structure
*((uint8_t*)(ops + 1) - 1) = 1;
return (struct proc_ops*)ops;
}
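// Sanity check (a hedged guard; the re-layout trick above silently assumes
// this): proc_ops must fit within file_operations with a spare trailing
// byte for the "already converted" flag, and that flag byte must start out
// zero. That holds for statically-initialized file_operations, whose unused
// tail is zero-filled. static_assert arrives via <linux/kernel.h>, which
// pulls in <linux/build_bug.h> on these kernel versions.
static_assert(sizeof(struct proc_ops) < sizeof(struct file_operations),
"compat_ops() requires proc_ops (plus a flag byte) to fit in file_operations");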
#else
const struct file_operations* compat_ops(const struct file_operations* ops) {
return ops;
}
#endif
#if INTERRUPT_DEBUG
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
}
#endif // INTERRUPT_DEBUG
// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_devices(void) {
// platform bus (SoC) iterators
struct device *dev = NULL;
struct device *temp_dev;
// PCI search iterator and search query
struct pci_dev *pcid = NULL;
// This query pattern is modeled on nouveau's
struct pci_device_id query = {
.vendor = NV_PCI_VENDOR, // Match NVIDIA devices
.device = PCI_ANY_ID,
.subvendor = PCI_ANY_ID,
.subdevice = PCI_ANY_ID,
.class_mask = 0xff << 16,
.class = PCI_BASE_CLASS_DISPLAY << 16, // Match display devs
};
int i = 0;
// Search the platform bus for the first device that matches our name
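// Note: each search below runs at most once; `dev' remains NULL until a
// match is found, and the first match short-circuits all later searches.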
// Search for embedded GPU on Jetson (generic name starting around L4T 36.3)
while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gpu")))
dev = temp_dev;
// Search for GA10B (Jetson Orin)
while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
dev = temp_dev;
// Search for GV11B (Jetson Xavier)
while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
dev = temp_dev;
// Search for GP10B (Jetson TX2)
while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
dev = temp_dev;
// Search for GM20B (Jetson TX1)
while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "57000000.gpu")))
dev = temp_dev;
// TODO: Support other platform bus devices (gk20a - TK1)
if (dev) {
mc_boot_0_t ids;
struct platform_device *platd = container_of(dev, struct platform_device, dev);
struct resource *regs = platform_get_resource(platd, IORESOURCE_MEM, 0);
g_nvdebug_state[i].g = get_gk20a(dev);
if (!regs)
return -EADDRNOTAVAIL;
g_nvdebug_state[i].regs = ioremap(regs->start, resource_size(regs));
if (!g_nvdebug_state[i].regs) {
printk(KERN_ERR "[nvdebug] Unable to map BAR0 on the integrated GPU\n");
return -EADDRNOTAVAIL;
}
// The Jetson TX1, TX2, Xavier, and Orin do not have a BAR2 (but do have
// BAR1). On the TX2 and later, the platform resources are:
// [nvdebug] Region 0: Memory at 17000000 [size=16777216]
// [nvdebug] Region 1: Memory at 18000000 [size=16777216]
// [nvdebug] Region 2: Memory at 3b41000 [size=4096]
// The TX1 has the same regions, but at different base addresses.
g_nvdebug_state[i].bar3 = NULL;
g_nvdebug_state[i].pcid = NULL;
g_nvdebug_state[i].platd = platd;
g_nvdebug_state[i].dev = dev;
// Don't check Chip ID until everything else is initialized
ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
if (ids.raw == -1) {
printk(KERN_ERR "[nvdebug] Unable to read config from Master Controller on the integrated GPU\n");
return -EADDRNOTAVAIL;
}
g_nvdebug_state[i].chip_id = ids.chip_id;
printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.\n",
ids.chip_id, ARCH2NAME(ids.architecture));
i++;
}
// Search the PCI bus and iterate through all matches
// FIXME: Undo the pci_iomap() if this fails
while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
mc_boot_0_t ids;
g_nvdebug_state[i].g = NULL;
// Map BAR0 (GPU control registers)
g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
if (!g_nvdebug_state[i].regs) {
pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
return -EADDRNOTAVAIL;
}
// Map BAR3 (CPU-accessible mappings of GPU DRAM)
g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
// XXX: Try mapping only the lower half of BAR3 on fail
// (vesafb may map the top half for display)
if (!g_nvdebug_state[i].bar3)
g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
g_nvdebug_state[i].pcid = pcid;
g_nvdebug_state[i].platd = NULL;
g_nvdebug_state[i].dev = &pcid->dev;
// Don't check Chip ID until everything else is initialized
ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
if (ids.raw == -1) {
pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
return -EADDRNOTAVAIL;
}
g_nvdebug_state[i].chip_id = ids.chip_id;
printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.\n",
ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
}
#endif // INTERRUPT_DEBUG
i++;
}
// Return the number of devices found
if (i > 0)
return i;
return -ENODEV;
}
// Support: Fermi, Maxwell, Pascal, Volta, Turing
int get_last_runlist_id_gk104(struct nvdebug_state *g) {
ptop_device_info_gk104_t info;
int i, max_rl_id = 0; // Always at least one runlist
// Figure out how many runlists there are by checking the device info
// registers. Runlists are always numbered sequentially, so we just have
// to find the highest-valued one and add 1 to get the number of runlists.
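// Worked example: if the entries map engines to runlists {0, 0, 1, 2}, the
// scan leaves max_rl_id at 2, i.e., the GPU has three runlists.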
for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
if ((info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(i))) == -1)
return -EIO;
if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
continue;
if (info.runlist_enum > max_rl_id)
max_rl_id = info.runlist_enum;
}
return max_rl_id;
}
// Support: Ampere, Hopper, Ada (and likely newer)
// Identical structure to get_runlist_ram() in runlist.c. See comments there.
int get_last_runlist_id_ga100(struct nvdebug_state *g) {
ptop_device_info_ga100_t ptop_entry;
int i, runlist_count = 0;
int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g);
int ptop_entry_subrow = 0;
for (i = 0; i < ptop_size; i++) {
if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1)
return -EIO;
if (!ptop_entry.raw)
continue;
if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0)
runlist_count++;
if (ptop_entry.has_next_entry)
ptop_entry_subrow += 1;
else
ptop_entry_subrow = 0;
}
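// runlist_count runlists were seen; the last valid ID is runlist_count - 1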
return runlist_count - 1;
}
// Return the maximum runlist ID. For a two-runlist GPU, this would return 1.
int get_last_runlist_id(int device_id) {
struct nvdebug_state* g = &g_nvdebug_state[device_id];
if (g->chip_id >= NV_CHIP_ID_AMPERE)
return get_last_runlist_id_ga100(g);
else
return get_last_runlist_id_gk104(g);
}
// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
// Support: Maxwell+
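// Example (a sketch; mask values are chip- and fusing-specific):
//   $ cat /proc/gpu0/gpc0_tpc_mask
// reads the NV_FUSE_TPC_FOR_GPC register for GPC 0: a bitmask with a set
// bit for each disabled TPC in that GPC.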
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
struct nvdebug_state* g = &g_nvdebug_state[device_id];
char file_name[20];
int i;
struct proc_dir_entry *gpc_tpc_mask_entry;
// Get the maximum possible number of GPCs for this chip (some may be fused off)
uint32_t max_gpcs = nvdebug_readl(g, NV_PTOP_SCAL_NUM_GPCS);
// Get a bitmask of which GPCs are disabled
uint32_t gpcs_mask;
if (g->chip_id < NV_CHIP_ID_AMPERE)
gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GM107);
else
gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GA100);
// Verify the reads succeeded
if (max_gpcs == -1 || gpcs_mask == -1)
return -EIO;
// For each enabled GPC, expose a mask of disabled TPCs
for (i = 0; i < max_gpcs; i++) {
// Do nothing if GPC is disabled
if ((1 << i) & gpcs_mask)
continue;
// If GPC is enabled, create an entry to read disabled TPCs mask
snprintf(file_name, 20, "gpc%d_tpc_mask", i);
if (g->chip_id < NV_CHIP_ID_AMPERE)
gpc_tpc_mask_entry = proc_create_data(
file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GM107(i));
else
gpc_tpc_mask_entry = proc_create_data(
file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GA100(i));
if (!gpc_tpc_mask_entry)
return -ENOMEM;
}
return 0;
}
int __init nvdebug_init(void) {
struct proc_dir_entry *dir;
int err, res;
// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
if ((res = probe_and_cache_devices()) < 0)
return res;
g_nvdebug_devices = res;
// Create separate ProcFS directories for each GPU
while (res--) {
uintptr_t last_runlist = 0;
int last_runlist_id;
char device_id_str[7];
// Create a wider copy of the GPU ID to allow us to abuse the *data
// field of proc_dir_entry to store the GPU ID.
uintptr_t device_id = res;
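// (Handlers can recover this via the pde_data()/PDE_DATA() accessor on the
// file's inode; pde_data() is the spelling since kernel 5.17.)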
// Create directory /proc/gpu# where # is the GPU number
// As ProcFS entry creation only fails when out of memory, any creation
// error below jumps straight to the out-of-memory handler.
snprintf(device_id_str, 7, "gpu%lu", device_id);
if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
goto out_nomem;
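// Resulting tree (a sketch, for an Ampere+ GPU; on pre-Ampere, preempt_tsg,
// enable_channel, and disable_channel sit directly under /proc/gpu#/):
//   /proc/gpu#/
//     device_info, num_gpcs, num_tpc_per_gpc, num_ces, resubmit_runlist, ...
//     runlist#/
//       runlist, switch_to_tsg, preempt_tsg, enable_channel, disable_channel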
// Create files in the `/proc/gpu#/runlist#/` directory
// The read handling code looks at the `pde_data` associated with the parent
// directory to determine what the runlist ID is.
// (Use a signed temporary: last_runlist is unsigned, so a `< 0` check on it
// could never fire, and a bare `return` here would leak this GPU's
// already-created ProcFS directory.)
if ((last_runlist_id = get_last_runlist_id(device_id)) < 0) {
err = last_runlist_id;
goto out_err;
}
last_runlist = last_runlist_id;
do {
char runlist_name[12];
struct proc_dir_entry *rl_dir;
// Create `/proc/gpu#/runlist#` directory
snprintf(runlist_name, 12, "runlist%lu", last_runlist);
if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id)))
goto out_nomem;
// Create one file for each runlist on Ampere+, or one file for each GPU on older
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE || last_runlist == 0) {
struct proc_dir_entry *chram_scope;
// preempt_tsg, enable_channel, and disable_channel refer to a GPU-global channel
// RAM on pre-Ampere GPUs
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE)
chram_scope = rl_dir;
else
chram_scope = dir;
// Create file `/proc/gpu#/runlist#/preempt_tsg`, world writable
// On Turing and older, `/proc/gpu#/preempt_tsg`
if (!proc_create_data(
"preempt_tsg", 0222, chram_scope, compat_ops(&preempt_tsg_file_ops),
(void*)last_runlist))
goto out_nomem;
// Create file `/proc/gpu#/runlist#/disable_channel`, world writable
// On Turing and older, `/proc/gpu#/disable_channel`
if (!proc_create_data(
"disable_channel", 0222, chram_scope, compat_ops(&disable_channel_file_ops),
(void*)last_runlist))
goto out_nomem;
// Create file `/proc/gpu#/runlist#/enable_channel`, world writable
// On Turing and older, `/proc/gpu#/enable_channel`
if (!proc_create_data(
"enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops),
(void*)last_runlist))
goto out_nomem;
}
// Create file `/proc/gpu#/runlist#/runlist`, world readable
if (!proc_create_data(
"runlist", 0444, rl_dir, compat_ops(&runlist_file_ops),
(void*)last_runlist))
goto out_nomem;
// Create file `/proc/gpu#/runlist#/switch_to_tsg`, world writable
if (!proc_create_data(
"switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops),
(void*)last_runlist))
goto out_nomem;
} while (last_runlist-- > 0);
/* On the TU104, the context scheduler (contained in the Host, aka
 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
* containing re-enabled channels. Resubmitting the runlist
* configuration appears to remediate this condition, and so this API
* is exposed to help reset GPU scheduling as necessary.
*/
// Create file `/proc/gpu#/resubmit_runlist`, world writable
if (!proc_create_data(
"resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops),
(void*)device_id))
goto out_nomem;
// Create file `/proc/gpu#/device_info`, world readable
if (!proc_create_data(
"device_info", 0444, dir, compat_ops(&device_info_file_ops),
(void*)device_id))
goto out_nomem;
// Create file `/proc/gpu#/num_gpcs`, world readable
if (!proc_create_data(
"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_PTOP_SCAL_NUM_GPCS))
goto out_nomem;
// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
if (!proc_create_data(
"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC))
goto out_nomem;
// Create file `/proc/gpu#/num_ces`, world readable
if (!proc_create_data(
"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_PTOP_SCAL_NUM_CES))
goto out_nomem;
// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable (Maxwell+)
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL)
if ((err = create_tpc_mask_files(device_id, dir)))
goto out_err;
// Create file `/proc/gpu#/gpc_mask`, world readable (Maxwell+)
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE) {
if (!proc_create_data(
"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_FUSE_GPC_GA100))
goto out_nomem;
} else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
if (!proc_create_data(
"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_FUSE_GPC_GM107))
goto out_nomem;
}
// Create file `/proc/gpu#/local_memory`, world readable (Pascal+)
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
if (!proc_create_data(
"local_memory", 0444, dir, compat_ops(&local_memory_file_ops),
(void*)0x00100ce0))
goto out_nomem;
}
// Create files exposing LCE and PCE configuration (Pascal+)
if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
// Create file `/proc/gpu#/copy_topology`, world readable
if (!proc_create_data(
"copy_topology", 0444, dir, compat_ops(©_topology_file_ops),
(void*)0))
goto out_nomem;
// Create file `/proc/gpu#/pce_map`, world readable
if (!proc_create_data(
"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
(void*)NV_CE_PCE_MAP))
goto out_nomem;
}
}
// (See Makefile if you want to know the origin of GIT_HASH.)
printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
return 0;
out_nomem:
err = -ENOMEM;
out_err:
// Make sure to clear all ProcFS directories on error
while (res < g_nvdebug_devices) {
char device_id_str[7];
snprintf(device_id_str, 7, "gpu%d", res);
remove_proc_subtree(device_id_str, NULL);
res++;
}
return err;
}
static void __exit nvdebug_exit(void) {
struct nvdebug_state *g;
// Deinitialize each device
while (g_nvdebug_devices--) {
// Remove procfs directory
char device_id[7];
snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
remove_proc_subtree(device_id, NULL);
g = &g_nvdebug_state[g_nvdebug_devices];
// Free BAR mappings for PCIe devices
if (g->pcid) {
if (g->regs)
pci_iounmap(g->pcid, g->regs);
if (g->bar2)
pci_iounmap(g->pcid, g->bar2);
// probe_and_cache_devices() maps BAR3; unmap it as well
if (g->bar3)
pci_iounmap(g->pcid, g->bar3);
#if INTERRUPT_DEBUG
free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
} else {
if (g->regs)
iounmap(g->regs);
}
printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.\n", g->chip_id);
}
printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}
module_init(nvdebug_init);
module_exit(nvdebug_exit);