// Helpers to deal with NVIDIA's MMU and associated page tables
#include <linux/kernel.h>  // Kernel types

#include "nvdebug.h"

/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
  a configurable 1MB window into VRAM which is mapped into BAR0 (register)
  space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
  and appears to be used today to bootstrap page table configuration.

  Why is it mapped at a location called NVIDIA Private RAM Instance? Because
  this used to point to the entirety of instance RAM, which was separate from
  VRAM on older NVIDIA GPUs.
*/

/* Convert a physical VRAM address to an offset in the PRAMIN window
  @param addr VRAM address to convert
  @return 0 on error, PRAMIN offset on success

  Note: Use phy2PRAMIN() instead if you want a dereferenceable address
*/
uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
	uint64_t pramin_base_va;
	bar0_window_t window;
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	// Check if the address is valid (49 bits are addressable on-GPU)
	if (addr & ~0x0001ffffffffffff) {
		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
		       addr, __func__);
		return 0;
	}
	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
	if (window.target != TARGET_VID_MEM)
		return 0;
	pramin_base_va = ((uint64_t)window.base) << 16;
	// Protect against out-of-bounds accesses
	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
		return 0;
	return addr - pramin_base_va;
}

/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
  straightforward starting with Pascal ("page table version 2"), except for a
  few quirks (like 16-byte PDE0 entries, while all other entries are 8 bytes).

  All you really need to know is that any given Page Directory Entry (PDE)
  contains a pointer to the start of a 4k page densely filled with PDEs or
  Page Table Entries (PTEs).

  == Page Table Refresher ==
  Page tables convert virtual addresses to physical addresses, and they do
  this via a tree structure. Leaves (PTEs) contain a physical address, and
  the path from root to leaf is defined by the virtual address. Non-leaf
  nodes are PDEs. When descending, the virtual address is sliced into pieces,
  and one slice is used at each level (as an index) to select the
  next-visited node (in level+1).

  V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
  PTEs. How the virtual address is sliced to yield an index into each level
  and a page offset is shown by Fig 1.

  == Figure 1 ==
                  Page Offset (12 bits) <---------------------------+
   Page Table Entry (PTE) (9 bits) <-----------------------+        |
  Page Directory Entry (PDE) 0 (8 bits) <--------+         |        |
             PDE1 (9 bits) <-----------+         |         |        |
      PDE2 (9 bits) <--------+         |         |         |        |
  PDE3 (2 bits) <--+         |         |         |         |        |
                   ^         ^         ^         ^         ^        ^
  Virtual addr: [48, 47]  [46, 38]  [37, 29]  [28, 21]  [20, 12]  [11, 0]

  The following arrays merely represent different projections of Fig. 1, and
  only one is strictly needed to reconstruct all the others. However, due to
  the complexity of page tables, we include all of these to aid in
  readability.
*/
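/* As a reading aid, the following sketch shows how Figure 1 slices a virtual
  address into per-level indices. It is illustrative only and is not used by
  the walkers below (they reconstruct virtual addresses in the opposite
  direction); the constants simply restate the NV_MMU_PT_V2_* tables defined
  next.
*/
static inline uint32_t v2_va_to_index(uint64_t va, int level) {
	// Least-significant bit of each level's slice, per Figure 1
	// (level 0 is PDE3, level 4 is the PTE level)
	static const int lsb[5] = {47, 38, 29, 21, 12};
	// Number of entries at each level; entries - 1 is the index mask
	static const int entries[5] = {4, 512, 512, 256, 512};
	return (va >> lsb[level]) & (entries[level] - 1);
}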
// How many nodes/entries per level in V2 of NVIDIA's page table format
static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
// Size in bytes of an entry at a particular level
static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};

// Convert a GPU physical address to a CPU virtual address via the PRAMIN window
void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
	return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
}

/* FIXME
void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
	return g->bar2 + off;
}
*/

uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       void __iomem *pde_offset,
				       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				       uint64_t addr_to_find,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *next;
	page_dir_entry_t entry;
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to work around PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_offset += 8;
	entry.raw = readl(pde_offset);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n",
		       ((u64)entry.addr) << 12, entry.raw);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n",
	       ((u64)entry.addr) << 12, entry.target, level, entry.raw);
	// Depth-first search of the page table (the child page holds
	// level + 1 entries)
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
		// off2addr can fail
		if (!next) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n",
			       __func__);
			return 0;
		}
		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}

/* Search a page directory of the GPU MMU
  @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
  @param off2addr     Function to convert VRAM physical addresses to valid CPU VAs
  @param addr_to_find Physical address to reconstruct the virtual address of
  @return 0 on error, otherwise the virtual address at which addr_to_find is
          mapped by this page table.
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       void __iomem *pde_offset,
			       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
			       uint64_t addr_to_find) {
	uint64_t res, i;
	// Make sure that the query is page-aligned
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n",
		       addr_to_find);
		return 0;
	}
	// Search the top-level page directory (PDE3)
	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
		if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
	return 0;
}
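/* Usage sketch (hypothetical): given the GPU-physical address of a context's
  PDE3 array (typically read from an instance block elsewhere in nvdebug),
  find the virtual address at which phys_target is mapped. phy2PRAMIN serves
  as the off2addr callback, so the BAR0/PRAMIN window must already cover the
  page tables for the lookups to succeed. pd_phys and phys_target are
  stand-in parameters for illustration.
*/
static inline uint64_t example_v2_reverse_lookup(struct nvdebug_state *g,
						 uint64_t pd_phys,
						 uint64_t phys_target) {
	// Turn the page directory base into a dereferenceable PRAMIN pointer.
	// Note that phy2PRAMIN cannot signal failure; vram2PRAMIN errors map
	// silently to PRAMIN offset 0.
	void __iomem *pd = phy2PRAMIN(g, pd_phys);
	// Walk the V2 page table, reconstructing the VA of phys_target
	return search_page_directory(g, pd, phy2PRAMIN, phys_target);
}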
/* GMMU Page Tables Version 1
  This page table only contains 2 levels and is used in the Fermi, Kepler,
  and Maxwell architectures.
*/
// Number of entries in the PDE and PTE levels
static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 1<<13 is an educated guess!!!
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!

uint64_t search_v1_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		// Convert to a dereferenceable pointer in CPU virtual address space
		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
		if (!pte_offset)
			continue;
		// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
		// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Don't overrun the PRAMIN window
			if (pte_offset + j * sizeof(page_tbl_entry_v1_t) >= g->regs + NV_PRAMIN + NV_PRAMIN_LEN)
				return 0;
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}

/* GMMU Page Tables Version 0
  This page table only contains 2 levels and is used in the Tesla architecture.
*/
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ  2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
				  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//	printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			// If we find a matching PTE, return its virtual address
			if (pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0; // No match
}
*/
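/* Putting the two tested walkers together (hypothetical sketch): the caller
  is expected to know which page table format the GPU uses. Per the notes
  above, V2 applies starting with Pascal and V1 applies to Fermi, Kepler, and
  Maxwell; is_pascal_or_newer is a stand-in for however that architecture
  check is made elsewhere in nvdebug.
*/
static inline uint64_t example_search_any_page_directory(struct nvdebug_state *g,
							  void __iomem *pd,
							  uint64_t addr_to_find,
							  bool is_pascal_or_newer) {
	if (is_pascal_or_newer)
		return search_page_directory(g, pd, phy2PRAMIN, addr_to_find); // V2 walker
	return search_v1_page_directory(g, pd, phy2PRAMIN, addr_to_find); // V1 walker
}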