/* Copyright 2024 Joshua Bakita
* Helpers to deal with NVIDIA's MMU and associated page tables
*/
#include <linux/err.h> // ERR_PTR() etc.
#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
#include <linux/kernel.h> // Kernel types
#include "nvdebug.h"
// Uncomment to print every PDE and PTE walked for debugging
//#define DEBUG
#ifdef DEBUG
#define printk_debug printk
#else
#define printk_debug(...)
#endif
/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
AMDGPU driver.
@param addr Pointer from page directory entry (PDE)
@param pd_ap PD-type aperture (target address space) for `addr`
@return A dereferenceable kernel address, or an ERR_PTR-wrapped error
*/
void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
	struct iommu_domain *dom;
	phys_addr_t phys;
	// Validate arguments
	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
		return ERR_PTR(-EINVAL);
	// VID_MEM accesses are the simple common-case
	if (pd_ap == PD_AND_TARGET_VID_MEM) {
		// Using BAR2 requires a page-table traversal. As this function is part
		// of the page-table traversal process, it must instead use PRAMIN.
		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
		if (off < 0)
			return ERR_PTR(off);
		return g->regs + NV_PRAMIN + off;
	}
	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
	 * driver are known to create page directory entries in SYS_MEM.
	 *
	 * On systems using an I/O MMU, or some other I/O virtual address space,
	 * these are **not** physical addresses, and must first be translated
	 * through the I/O MMU before use.
	 * Example default meaning of a SYS_MEM address for a few CPUs:
	 * - Jetson Xavier : physical address
	 * - AMD 3950X     : I/O MMU address
	 * - Phenom II x4  : physical address
	 */
	// Check for, and translate through, the I/O MMU (if any)
	if ((dom = iommu_get_domain_for_dev(g->dev))) {
		phys = iommu_iova_to_phys(dom, addr);
		// Fix: this is informational, not an error — log via printk_debug so a
		// successful translation does not spam the kernel log at KERN_ERR on
		// every SYS_MEM-resident PDE/PTE dereference during a page-table walk.
		printk_debug(KERN_INFO "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys);
	} else
		phys = addr;
	// iommu_iova_to_phys() yields 0 when no translation exists; callers detect
	// this via IS_ERR_OR_NULL(). Return NULL (not the integer 0) for clarity.
	if (!phys)
		return NULL;
	return phys_to_virt(phys);
}
// Internal helper for search_page_directory().
// Recursively walk one subtree of a V2 page directory, depth-first, searching
// for a PTE that maps addr_to_find in addr_to_find_aperture.
// @param pde_addr   VID_MEM/SYS_MEM address of the PDE to examine at `level`
// @param pde_target Aperture (address space) that `pde_addr` belongs to
// @param level      Current page-table depth; 0 is the top (PDE3) level
// @return 0 if no mapping found; otherwise the virtual-address bits contributed
//         by this level and below (callers OR in their own index bits). A PTE
//         match returns a literal 1 — a truthy sub-page value that the
//         top-level caller masks off with `& ~0xfff`.
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
uintptr_t pde_addr,
enum PD_TARGET pde_target,
uint64_t addr_to_find,
enum INST_TARGET addr_to_find_aperture,
uint32_t level) {
uint64_t res, i;
void __iomem *pde_kern;
page_dir_entry_t entry;
// Recursion depth guard.
// NOTE(review): sizeof() yields the array size in *bytes*, not elements, so
// this bound is looser than the actual number of table levels — confirm
// against the declaration of NV_MMU_PT_V2_SZ (ARRAY_SIZE may be intended).
if (level > sizeof(NV_MMU_PT_V2_SZ))
return 0;
// Hack to workaround PDE0 being double-size and strangely formatted
if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
pde_addr += 8;
// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
pde_kern = pd_deref(g, pde_addr, pde_target);
if (IS_ERR_OR_NULL(pde_kern)) {
// NOTE(review): PTR_ERR() evaluates to 0 when pde_kern is NULL (rather
// than an ERR_PTR), so this can log "Error 0" on a failed translation.
printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
return 0;
}
// Read the page directory entry (a pointer to another directory, or a PTE)
entry.raw_w = readq(pde_kern);
// If we reached an invalid (unpopulated) PDE, walk back up the tree
if (entry.target == PD_AND_TARGET_INVALID)
return 0;
// Succeed when we reach a PTE with the address we want
if (entry.is_pte) {
// TODO: Handle huge pages here
printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
// 1 on a match (address and aperture both agree), 0 otherwise
return (uint64_t)entry.addr << 12 == addr_to_find && entry.aperture == addr_to_find_aperture;
}
printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
// Depth-first search of the page table
for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
// Address of the i-th entry in the next-lower directory/table
uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
res = search_page_directory_subtree(g, next, entry.target, addr_to_find, addr_to_find_aperture, level + 1);
if (res)
// Fold this level's index into the reconstructed virtual address
return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
}
return 0;
}
/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables
Depth-first search a page directory of the GPU MMU for where a particular
physical address is mapped. Upon finding a mapping, the virtual address is
returned.
The page directory may be located in VID_MEM, SYS_MEM, or some combination of
the two.
@param pd_config Page Directory configuration, containing pointer and
aperture for the start of the PDE3 entries
@param addr_to_find Physical address to reconstruct the virtual address of
@param addr_to_find_aperture Aperture (SYS_MEM or VID_MEM) of addr_to_find
@return 0 on error, otherwise the virtual address at which addr_to_find is
mapped into by this page table. (Zero is not a valid virtual address)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       page_dir_config_t pd_config,
			       uint64_t addr_to_find,
			       enum INST_TARGET addr_to_find_aperture) {
	// Base of the top-level (PDE3) directory, as a byte address
	uintptr_t pd_base = (uintptr_t)pd_config.page_dir << 12;
	uint64_t i = 0;
	// Reject queries that are not page-aligned; sub-page bits are meaningless
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, pd_base);
	// Walk each top-level (PDE3) entry, descending depth-first into each
	while (i < NV_MMU_PT_V2_SZ[0]) {
		uint64_t match = search_page_directory_subtree(g, pd_base + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, addr_to_find_aperture, 0);
		if (match)
			// Clear the bogus sub-page bits and fold in this level's index
			return (match & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
		i++;
	}
	return 0;
}
/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
(See `search_page_directory()` for documentation.)
*/
uint64_t search_v1_page_directory(struct nvdebug_state *g,
page_dir_config_t pd_config,
uint64_t addr_to_find,
enum INST_TARGET addr_to_find_aperture) {
uint64_t j, i = 0;
page_dir_entry_v1_t pde;
page_tbl_entry_v1_t pte;
uintptr_t pte_offset, pde_offset;
void __iomem *pte_addr, *pde_addr;
// For each PDE (V1 tables are exactly two levels: PDE list -> PTE lists)
do {
// Index the list of page directory entries
pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
if (IS_ERR_OR_NULL(pde_addr)) {
// NOTE(review): -PTR_ERR() logs a *positive* errno here, whereas the
// matching message in search_page_directory_subtree() logs PTR_ERR()
// directly (negative) — confirm which convention is intended.
printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr));
return 0;
}
// readq doesn't seem to work on BAR0; assemble the 64-bit PDE from two
// 32-bit reads (high word first, then low)
pde.raw = readl(pde_addr + 4);
pde.raw <<= 32;
pde.raw |= readl(pde_addr);
// Verify PDE is present (`continue` in this do/while still advances i
// via the `++i` in the loop condition)
if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
continue;
// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
// For each PTE
for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
// Index the list of page table entries starting at pde.alt_addr
pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
if (IS_ERR_OR_NULL(pte_addr)) {
printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr));
return 0;
}
// Read page table entry, avoiding readq (same two-readl assembly)
pte.raw = readl(pte_addr + 4);
pte.raw <<= 32;
pte.raw |= readl(pte_addr);
// Skip non-present PTEs
if (!pte.is_present)
continue;
printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
// If we find a matching PTE, return its virtual address, rebuilt from
// the PDE index (i) and PTE index (j).
// NOTE(review): pte.target is compared against an enum INST_TARGET
// value — confirm the V1 PTE target field uses the same encoding.
if ((uint64_t)pte.addr << 12 == addr_to_find && pte.target == addr_to_find_aperture)
return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
}
} while (++i < NV_MMU_PT_V1_SZ[0]);
return 0;
}
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
void __iomem *pde_offset,
void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
uint32_t addr_to_find) {
int j, i = 0;
page_dir_entry_v0_t pde;
page_tbl_entry_v0_t pte;
void __iomem *pte_offset;
// For each PDE
do {
// readq doesn't seem to work on BAR0
pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
pde.raw <<= 32;
pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
//if (pde.raw)
//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
// Skip unpopulated PDEs
if (pde.type == NOT_PRESENT)
continue;
//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
// For each PTE
for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
pte.raw <<= 32;
pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
// Skip non-present PTEs
if (!pte.is_present)
continue;
// If we find a matching PTE, return its virtual address
//if (pte.addr != 0x5555555)
// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
if (pte.addr << 12 == addr_to_find)
return i << NV_MMU_PT_V0_LSB | j << 12;
}
} while (++i < NV_MMU_PT_V0_SZ);
return 0; // No match
}
*/