/* Copyright 2024 Joshua Bakita
 * Helpers to deal with NVIDIA's MMU and associated page tables
 */
#include <linux/err.h>    // ERR_PTR() etc.
#include <linux/iommu.h>  // iommu_get_domain_for_dev() and iommu_iova_to_phys()
#include <linux/kernel.h> // ARRAY_SIZE()
#include <linux/types.h>  // Kernel types

#include "nvdebug.h"

/* Set logging level for MMU operations
 * g_verbose >= 1: Log a single message describing the MMU operation
 * g_verbose >= 2: Log every PDE and PTE traversed
 */
int g_verbose = 0;
#define printk_debug if (g_verbose >= 2) printk
#define printk_info if (g_verbose >= 1) printk

/* Convert a page directory (PD) pointer and aperture to be kernel-accessible

  I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
  AMDGPU driver.

  @param addr  Pointer from page directory entry (PDE)
  @param pd_ap PD-type aperture (target address space) for `addr`
  @return A dereferenceable kernel address, or an ERR_PTR-wrapped error
*/
static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
	struct iommu_domain *dom;
	phys_addr_t phys;

	// Validate arguments
	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
		return ERR_PTR(-EINVAL);

	// VID_MEM accesses are the simple common case
	if (pd_ap == PD_AND_TARGET_VID_MEM) {
		// Using BAR2 requires a page-table traversal. As this function is part
		// of the page-table traversal process, it must instead use PRAMIN.
		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
		if (off < 0)
			return ERR_PTR(off);
		return g->regs + NV_PRAMIN + off;
	}

	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
	 * driver are known to create page directory entries in SYS_MEM.
	 *
	 * On systems using an I/O MMU, or some other I/O virtual address space,
	 * these are **not** physical addresses, and must first be translated
	 * through the I/O MMU before use.
	 * Example default meaning of a SYS_MEM address for a few CPUs:
	 * - Jetson Xavier : physical address
	 * - AMD 3950X     : I/O MMU address
	 * - Phenom II x4  : physical address
	 */
	// Check for, and translate through, the I/O MMU (if any)
	if ((dom = iommu_get_domain_for_dev(g->dev))) {
		phys = iommu_iova_to_phys(dom, addr);
		printk_debug(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", addr, phys);
	} else
		phys = addr;
	if (!phys)
		return NULL;
	return phys_to_virt(phys);
}
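/* Example (sketch, not part of nvdebug): the I/O MMU translation step from
 * pd_deref() in isolation. `dev` is any struct device that may sit behind an
 * I/O MMU. This hypothetical helper returns 0 when the I/O VA is unmapped,
 * mirroring how pd_deref() treats a zero translation result as an error.
 */
static inline phys_addr_t example_iova_to_phys(struct device *dev, dma_addr_t iova) {
	struct iommu_domain *dom = iommu_get_domain_for_dev(dev);
	// No I/O MMU domain: the address is already physical
	if (!dom)
		return (phys_addr_t)iova;
	// Walk the I/O MMU page tables to recover the physical address
	return iommu_iova_to_phys(dom, iova);
}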
// Internal helper for search_page_directory().
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       uintptr_t pde_addr,
				       enum PD_TARGET pde_target,
				       uint64_t addr_to_find,
				       enum INST_TARGET addr_to_find_aperture,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *pde_kern;
	page_dir_entry_t entry;
	// Stop before running off the end of the per-level size table
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to work around PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_addr += 8;
	// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
	pde_kern = pd_deref(g, pde_addr, pde_target);
	if (IS_ERR_OR_NULL(pde_kern)) {
		printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n",
		       __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
		return 0;
	}
	// Read the page directory entry (a pointer to another directory, or a PTE)
	entry.raw_w = readq(pde_kern);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		// TODO: Handle huge pages here
		printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n",
			     ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
		return (uint64_t)entry.addr << 12 == addr_to_find && entry.aperture == addr_to_find_aperture;
	}
	printk_debug(KERN_DEBUG "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n",
		     ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
	// Depth-first search of the page table
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
		printk_debug(KERN_DEBUG "[nvdebug] Searching index %llu in lvl %d\n", i, level + 1);
		res = search_page_directory_subtree(g, next, entry.target, addr_to_find, addr_to_find_aperture, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}

/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables

  Depth-first search a page directory of the GPU MMU for where a particular
  physical address is mapped. Upon finding a mapping, the virtual address is
  returned.

  The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
  across multiple apertures.

  @param pd_config     Page Directory configuration, containing pointer and
                       aperture for the start of the PDE3 entries
  @param addr_to_find  Physical address to reconstruct the virtual address of
  @param addr_to_find_aperture  Aperture (SYS_MEM or VID_MEM) of addr_to_find
  @return 0 on error, otherwise the virtual address at which addr_to_find is
          mapped by this page table. (Zero is not a valid virtual address.)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       page_dir_config_t pd_config,
			       uint64_t addr_to_find,
			       enum INST_TARGET addr_to_find_aperture) {
	uint64_t res, i;
	// Make sure that the query is page-aligned
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk_info(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n",
		    addr_to_find, (uintptr_t)pd_config.page_dir << 12);
	// Search the top-level page directory (PDE3)
	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
		if ((res = search_page_directory_subtree(g, ((uintptr_t)pd_config.page_dir << 12) + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, addr_to_find_aperture, 0)))
			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
	return 0;
}
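/* Example (sketch, not part of nvdebug): how the search result is assembled.
 * As the recursion in search_page_directory_subtree() unwinds, each level ORs
 * its entry index into the result at that level's LSB bit position, and
 * search_page_directory() masks off the low 12 bits, which carry the boolean
 * "found" flag from the PTE level. This hypothetical helper composes a
 * virtual address the same way from a per-level index array (deepest level
 * last), given NV_MMU_PT_V2_LSB-style bit positions.
 */
static inline uint64_t example_compose_va(const uint64_t *idx, const uint32_t *lsb, size_t levels) {
	uint64_t va = 0;
	size_t level;
	// Each level contributes its index at that level's LSB bit position
	for (level = 0; level < levels; level++)
		va |= idx[level] << lsb[level];
	return va;
}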
/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
   (See `search_page_directory()` for documentation.)
*/
uint64_t search_v1_page_directory(struct nvdebug_state *g,
				  page_dir_config_t pd_config,
				  uint64_t addr_to_find,
				  enum INST_TARGET addr_to_find_aperture) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	uintptr_t pte_offset, pde_offset;
	void __iomem *pte_addr, *pde_addr;
	// This function only understands the Page Table Version 1 format
	if (pd_config.is_ver2) {
		printk(KERN_ERR "[nvdebug] Passed a Version 2 page table at %#018llx to search_v1_page_directory()!\n",
		       (uint64_t)pd_config.page_dir << 12);
		return 0;
	}
	// We only understand the Version 1 format when 128 KiB huge pages are in use
	if (pd_config.is_64k_big_page) {
		printk(KERN_ERR "[nvdebug] Page Table Version 1 with 64 KiB huge pages is unsupported!\n");
		return 0;
	}
	printk_info(KERN_INFO "[nvdebug] Searching V1 page table at %#018lx in %s for addr %#018llx\n",
		    (uintptr_t)pd_config.page_dir << 12, target_to_text(pd_config.target), addr_to_find);
	// For each PDE
	do {
		// Index the list of page directory entries
		pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
		// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
		pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
		if (IS_ERR_OR_NULL(pde_addr)) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n",
			       __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr));
			return 0;
		}
		// readq doesn't seem to work on BAR0; read as two 32-bit halves
		pde.raw = readl(pde_addr + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_addr);
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n",
			     pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Index the list of page table entries starting at pde.alt_addr
			pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
			// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
			pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
			if (IS_ERR_OR_NULL(pte_addr)) {
				printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n",
				       __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr));
				return 0;
			}
			// Read page table entry, avoiding readq
			pte.raw = readl(pte_addr + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_addr);
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n",
				     ((u64)pte.addr) << 12, target_to_text(pte.target), pte.is_volatile, pte.is_privileged, pte.is_readonly, pte.atomics_disabled, pte.raw);
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find && pte.target == addr_to_find_aperture)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}
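/* Example (sketch, not part of nvdebug): the split 64-bit MMIO read used
 * above, in isolation. readq() does not appear to work over BAR0 on these
 * GPUs, so 64-bit entries are read as two 32-bit halves, upper word first.
 * Note that the two halves are not read atomically; an entry updated
 * concurrently by the GPU could tear.
 */
static inline u64 example_readq_via_readl(void __iomem *addr) {
	u64 val;
	val = readl(addr + 4); // Upper 32 bits
	val <<= 32;
	val |= readl(addr);    // Lower 32 bits
	return val;
}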
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
				  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//	printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			if (pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0; // No match
}
*/
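/* Usage example (sketch): reverse-translating a GPU physical address with
 * search_page_directory(). The caller and the source of its page_dir_config_t
 * are hypothetical; TARGET_VID_MEM is assumed to be the VID_MEM member of
 * enum INST_TARGET, as used with addr_to_pramin_mut() above. Left commented
 * out, following the convention of the untested block above.

static void example_reverse_translate(struct nvdebug_state *g,
				      page_dir_config_t pd_config,
				      uint64_t phys) {
	// Queries must be page-aligned; a return of 0 means unmapped (or error)
	uint64_t virt = search_page_directory(g, pd_config, phys & ~0xfffULL, TARGET_VID_MEM);
	if (virt)
		printk(KERN_INFO "[nvdebug] %#llx is mapped at GPU virtual address %#llx\n", phys, virt);
	else
		printk(KERN_INFO "[nvdebug] %#llx appears unmapped in this address space\n", phys);
}
*/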