/* Copyright 2024 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */
#include <linux/device.h>    // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>       // For PCI device scanning
#include <linux/proc_fs.h>   // So we can set up entries in /proc

#include "nvdebug_linux.h"
#include "stubs.h"

// Enable to intercept and log GPU interrupts. Historically used to benchmark
// interrupt latency.
#define INTERRUPT_DEBUG 0

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

// runlist_procfs.c
extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations resubmit_runlist_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
// device_info_procfs.c
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;
extern struct file_operations local_memory_file_ops;
// copy_topology_procfs.c
extern struct file_operations copy_topology_file_ops;

struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;

// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;

// Starting in Kernel 5.6, proc_ops is required instead of file_operations.
// As file_operations is larger than proc_ops, we can overwrite the memory
// backing the file_operations struct to follow the proc_ops layout, and then
// cast on newer kernels.
// We use the last byte of the file_operations struct to flag that the memory
// layout has been rearranged.
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	// Don't re-layout if it's already been done
	if (*((uint8_t*)(ops + 1) - 1))
		return (struct proc_ops*)ops;
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	// Flag re-layout as complete in last byte of structure
	*((uint8_t*)(ops + 1) - 1) = 1;
	return (struct proc_ops*)ops;
}
#else
const struct file_operations* compat_ops(const struct file_operations* ops) {
	return ops;
}
#endif

#if INTERRUPT_DEBUG
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
}
#endif // INTERRUPT_DEBUG
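/*
 * Example (illustrative): every ProcFS registration in this module funnels
 * its file_operations through compat_ops(), so a single call site builds on
 * both sides of the 5.6 API break, e.g.:
 *
 *	proc_create_data("device_info", 0444, dir,
 *	                 compat_ops(&device_info_file_ops), (void*)device_id);
 *
 * Note the in-place re-layout above is only sound because struct proc_ops is
 * smaller than struct file_operations, and because each ops struct is only
 * ever accessed via compat_ops() after module load.
 */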
// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_devices(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	// This query pattern is mirrored off nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR, // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16, // Match display devs
	};
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for embedded GPU on Jetson (generic name starting around L4T 36.3)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gpu")))
		dev = temp_dev;
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// Search for GM20B (Jetson TX1)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "57000000.gpu")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1)
	if (dev) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = get_gk20a(dev);
		g_nvdebug_state[i].regs = gk20a_regs(g_nvdebug_state[i].g);
		if (!g_nvdebug_state[i].regs)
			return -EADDRNOTAVAIL;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1)
			return -EADDRNOTAVAIL;
		g_nvdebug_state[i].chip_id = ids.chip_id;
		g_nvdebug_state[i].bar3 = NULL;
		g_nvdebug_state[i].pcid = NULL;
		g_nvdebug_state[i].dev = dev;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.\n",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	// FIXME: Undo the pci_iomap() if this fails
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// XXX: Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		g_nvdebug_state[i].pcid = pcid;
		g_nvdebug_state[i].dev = &pcid->dev;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.\n",
		       ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
#endif // INTERRUPT_DEBUG
		i++;
	}
	// Return the number of devices found
	if (i > 0)
		return i;
	return -ENODEV;
}
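/*
 * Illustrative sketch (hypothetical helper, not called anywhere in the
 * module): how a consumer would use the state cached by
 * probe_and_cache_devices() above. As elsewhere in this file, a read of
 * all-ones (-1) from nvdebug_readl() indicates a failed or unmapped MMIO
 * access.
 */
static u32 __maybe_unused example_chip_id(int device_id) {
	struct nvdebug_state *g = &g_nvdebug_state[device_id];
	mc_boot_0_t ids;
	// NV_MC_BOOT_0 identifies the chip on all supported GPUs
	ids.raw = nvdebug_readl(g, NV_MC_BOOT_0);
	if (ids.raw == -1)
		return 0; // MMIO read failed (BAR0 unmapped or device lost)
	return ids.chip_id;
}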
// Support: Fermi, Maxwell, Pascal, Volta, Turing
int get_last_runlist_id_gk104(struct nvdebug_state *g) {
	ptop_device_info_gk104_t info;
	int i, max_rl_id = 0; // Always at least one runlist
	// Figure out the highest runlist ID by scanning the device info
	// registers. Runlists are always numbered sequentially, so the
	// highest-valued entry is the last runlist ID.
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
		if ((info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(i))) == -1)
			return -EIO;
		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
			continue;
		if (info.runlist_enum > max_rl_id)
			max_rl_id = info.runlist_enum;
	}
	return max_rl_id;
}

// Support: Ampere, Hopper, Ada (and likely newer)
// Identical structure to get_runlist_ram() in runlist.c. See comments there.
int get_last_runlist_id_ga100(struct nvdebug_state *g) {
	ptop_device_info_ga100_t ptop_entry;
	int i, runlist_count = 0;
	int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g);
	int ptop_entry_subrow = 0;
	for (i = 0; i < ptop_size; i++) {
		if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1)
			return -EIO;
		// Skip empty rows
		if (!ptop_entry.raw)
			continue;
		// Count entries whose runlist engine ID is 0 (one per runlist);
		// the rleng_id field lives on the third subrow of an entry
		if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0)
			runlist_count++;
		// Track which subrow of a multi-row entry we are on
		if (ptop_entry.has_next_entry)
			ptop_entry_subrow += 1;
		else
			ptop_entry_subrow = 0;
	}
	return runlist_count - 1;
}

// Return the maximum runlist ID. For a two-runlist GPU, this would return 1.
int get_last_runlist_id(int device_id) {
	struct nvdebug_state* g = &g_nvdebug_state[device_id];
	if (g->chip_id >= NV_CHIP_ID_AMPERE)
		return get_last_runlist_id_ga100(g);
	else
		return get_last_runlist_id_gk104(g);
}

// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
// Support: Maxwell+
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	struct nvdebug_state* g = &g_nvdebug_state[device_id];
	char file_name[20];
	int i;
	struct proc_dir_entry *gpc_tpc_mask_entry;
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(g, NV_PTOP_SCAL_NUM_GPCS);
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask;
	if (g->chip_id < NV_CHIP_ID_AMPERE)
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GM107);
	else
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GA100);
	// Verify the reads succeeded
	if (max_gpcs == -1 || gpcs_mask == -1)
		return -EIO;
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1 << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
		if (g->chip_id < NV_CHIP_ID_AMPERE)
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir,
				compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GM107(i));
		else
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir,
				compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GA100(i));
		if (!gpc_tpc_mask_entry)
			return -ENOMEM;
	}
	return 0;
}
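/*
 * Illustrative pattern (mirrors create_tpc_mask_files() above and
 * nvdebug_init() below): the generic reg32 read handler receives its
 * register offset through the ProcFS entry's data pointer, so exposing any
 * 32-bit register as a world-readable file is a single call:
 *
 *	proc_create_data("num_gpcs", 0444, dir,
 *	                 compat_ops(&nvdebug_read_reg32_file_ops),
 *	                 (void*)NV_PTOP_SCAL_NUM_GPCS);
 */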
int __init nvdebug_init(void) {
	struct proc_dir_entry *dir;
	int err, res;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	if ((res = probe_and_cache_devices()) < 0)
		return res;
	g_nvdebug_devices = res;

	// Create separate ProcFS directories for each GPU
	while (res--) {
		// Signed, so that the error check on get_last_runlist_id() works
		long last_runlist = 0;
		char device_id_str[7];
		// Create a wider copy of the GPU ID to allow us to abuse the
		// *data field of proc_dir_entry to store the GPU ID.
		uintptr_t device_id = res;
		// Create directory /proc/gpu# where # is the GPU number
		// As ProcFS entry creation only fails if out of memory, we
		// auto-skip to handling that on any error in creating ProcFS
		// files.
		snprintf(device_id_str, 7, "gpu%lu", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files in the `/proc/gpu#/runlist#/` directory
		// The read handling code looks at the `pde_data` associated
		// with the parent directory to determine what the runlist ID is.
		if ((last_runlist = get_last_runlist_id(device_id)) < 0) {
			err = last_runlist;
			goto out_err;
		}
		do {
			char runlist_name[12];
			struct proc_dir_entry *rl_dir;
			// Create `/proc/gpu#/runlist#` directory
			snprintf(runlist_name, 12, "runlist%ld", last_runlist);
			if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id)))
				goto out_nomem;
			// Create file `/proc/gpu#/runlist#/runlist`, world readable
			if (!proc_create_data(
				"runlist", 0444, rl_dir,
				compat_ops(&runlist_file_ops),
				(void*)last_runlist))
				goto out_nomem;
		} while (last_runlist-- > 0);
		// Create file `/proc/gpu#/preempt_tsg`, world writable
		if (!proc_create_data(
			"preempt_tsg", 0222, dir,
			compat_ops(&preempt_tsg_file_ops),
			(void*)device_id))
			goto out_nomem;
		/* On the TU104, the context scheduler (contained in the Host,
		 * aka PFIFO, unit) has been observed to sometimes fail to
		 * schedule TSGs containing re-enabled channels. Resubmitting
		 * the runlist configuration appears to remediate this
		 * condition, and so this API is exposed to help reset GPU
		 * scheduling as necessary.
		 */
		// Create file `/proc/gpu#/resubmit_runlist`, world writable
		if (!proc_create_data(
			"resubmit_runlist", 0222, dir,
			compat_ops(&resubmit_runlist_file_ops),
			(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/disable_channel`, world writable
		if (!proc_create_data(
			"disable_channel", 0222, dir,
			compat_ops(&disable_channel_file_ops),
			(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/enable_channel`, world writable
		if (!proc_create_data(
			"enable_channel", 0222, dir,
			compat_ops(&enable_channel_file_ops),
			(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/switch_to_tsg`, world writable
		if (!proc_create_data(
			"switch_to_tsg", 0222, dir,
			compat_ops(&switch_to_tsg_file_ops),
			(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/device_info`, world readable
		if (!proc_create_data(
			"device_info", 0444, dir,
			compat_ops(&device_info_file_ops),
			(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/num_gpcs`, world readable
		if (!proc_create_data(
			"num_gpcs", 0444, dir,
			compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_GPCS))
			goto out_nomem;
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		if (!proc_create_data(
			"num_tpc_per_gpc", 0444, dir,
			compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC))
			goto out_nomem;
		// Create file `/proc/gpu#/num_ces`, world readable
		if (!proc_create_data(
			"num_ces", 0444, dir,
			compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_CES))
			goto out_nomem;
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL)
			if ((err = create_tpc_mask_files(device_id, dir)))
				goto out_err;
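		// Note on data-pointer conventions for the entries above: files
		// backed by nvdebug_read_reg32_file_ops carry a register offset
		// in their own data pointer, while the GPU index is recovered
		// from the parent directory's data (set via proc_mkdir_data()
		// at the top of this loop).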
		// Create file `/proc/gpu#/gpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE) {
			if (!proc_create_data(
				"gpc_mask", 0444, dir,
				compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_FUSE_GPC_GA100))
				goto out_nomem;
		} else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
			if (!proc_create_data(
				"gpc_mask", 0444, dir,
				compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_FUSE_GPC_GM107))
				goto out_nomem;
		}
		// Create file `/proc/gpu#/local_memory`, world readable (Pascal+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			// 0x00100ce0 is NV_PFB_PRI_MMU_LOCAL_MEMORY_RANGE, which
			// reports the size of onboard GPU memory
			if (!proc_create_data(
				"local_memory", 0444, dir,
				compat_ops(&local_memory_file_ops),
				(void*)0x00100ce0))
				goto out_nomem;
		}
		// Create files exposing LCE and PCE configuration (Pascal+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			// Create file `/proc/gpu#/copy_topology`, world readable
			if (!proc_create_data(
				"copy_topology", 0444, dir,
				compat_ops(&copy_topology_file_ops),
				(void*)0))
				goto out_nomem;
			// Create file `/proc/gpu#/pce_map`, world readable
			if (!proc_create_data(
				"pce_map", 0444, dir,
				compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_CE_PCE_MAP))
				goto out_nomem;
		}
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;

out_nomem:
	err = -ENOMEM;
out_err:
	// Make sure to clear all ProcFS directories on error
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return err;
}

static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	// Deinitialize each device
	while (g_nvdebug_devices--) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
		remove_proc_subtree(device_id, NULL);
		g = &g_nvdebug_state[g_nvdebug_devices];
		// Free BAR mappings for PCIe devices
		if (g && g->pcid) {
			if (g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g->bar3)
				pci_iounmap(g->pcid, g->bar3);
#if INTERRUPT_DEBUG
			free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.\n", g->chip_id);
	}
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);
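/*
 * Example usage from userspace (illustrative; GPU numbering depends on probe
 * order, and the exact write formats are defined by the handlers in
 * runlist_procfs.c):
 *
 *	cat /proc/gpu0/device_info             # decoded engine/topology info
 *	cat /proc/gpu0/runlist0/runlist        # dump runlist 0
 *	echo <tsg_id> > /proc/gpu0/preempt_tsg # trigger a TSG preemption
 */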