aboutsummaryrefslogblamecommitdiffstats
path: root/mmu.c
blob: ababef5cb6ff08be7db48fd4ea07dc41163518f0 (plain) (tree)
1
2
3
4
5
6
7
8
9




                                                                                



                                          






                                                                   
 
                                                                             
 

                                                                         
 



                                                                      

                                                                      














                                                                                       
         













                                                                                   
                                                                                                                                     

                            
 
                  
                         
 
                                  
 
 
                                               
                                                               




                                                                              
                        
                               




                                                                            








                                                                                                                                                                                                     




                                                                            
                                               
                                                                                                                                                                                                                                                                                                                     
                                                                                                             
         
                                                                                                                                                                                                                               
                                               
                                                          
                                                                                                    
                                                                                                    
                                                                                                                           





                                                                        
                                                                                


                                                                            
 

                                                                             
 

                                                                          
                                                                            
                                                                            
                                                                            
                                                                               

                                                       


                                                                        





                                                                                                                                             
                                                                                                                                                            

                                                     
                                                                                                                                                                                                             



                                                                           


                                                                                
                                                          


                                                                           


                                

                                          














                                                                                                                                                                                             

                       







                                                                                                                                                                                                                                      
                                                     
                                              
                               
                                           


                                                                                           
                                                                                                                                                                                                       
                                                                                                                                                                                                                                                   

                                                          





                                                                                                                                                                                                                                          
                                         


                                                                
                                       
                                                   


                                                
                                                                                                                                                                                                                                                                                                        
                                                                                
                                                                                                            
                                                                                           




                                           









































                                                                                                                                                                 
/* Copyright 2024 Joshua Bakita
 * Helpers to deal with NVIDIA's MMU and associated page tables
 */
#include <linux/err.h>  // ERR_PTR() etc.
#include <linux/iommu.h>  // iommu_get_domain_for_dev() and iommu_iova_to_phys()
#include <linux/kernel.h>  // Kernel types

#include "nvdebug.h"

/* Set logging level for MMU operations
  g_verbose >= 1: Log a single message describing the MMU operation
  g_verbose >= 2: Log every PDE and PTE traversed

  Defaults to silent (0). Read by the printk_* macros below at every call
  site, so the level may be changed at runtime (e.g. via a module parameter
  or debugger) without rebuilding.
*/
int g_verbose = 0;
// NOTE: These expand to a *bare* guarded printk — an un-braced `if` statement.
// Using one as the sole body of an outer if/else risks the dangling-else
// ambiguity; wrap such call sites in braces.
#define printk_debug if (g_verbose >= 2) printk
#define printk_info  if (g_verbose >= 1) printk

/* Convert a page directory (PD) pointer and aperture to be kernel-accessible

  I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
  AMDGPU driver.

  @param addr  Pointer from page directory entry (PDE)
  @param pd_ap PD-type aperture (target address space) for `addr`
  @return A dereferencable kernel address, or an ERR_PTR-wrapped error
 */
static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr,
                              enum PD_TARGET pd_ap) {
	struct iommu_domain *dom;
	phys_addr_t phys;

	// Validate arguments: aperture must be a known PD target, and a PDE
	// pointing at address 0 is never legitimate.
	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
		return ERR_PTR(-EINVAL);

	// VID_MEM accesses are the simple common-case
	if (pd_ap == PD_AND_TARGET_VID_MEM) {
		// Using BAR2 requires a page-table traversal. As this function is part
		// of the page-table traversal process, it must instead use PRAMIN.
		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
		if (off < 0)
			return ERR_PTR(off);
		return g->regs + NV_PRAMIN + off;
	}
	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
	 * driver are known to create page directory entries in SYS_MEM.
	 *
	 * On systems using an I/O MMU, or some other I/O virtual address space,
	 * these are **not** physical addresses, and must first be translated
	 * through the I/O MMU before use.
	 * Example default meaning of a SYS_MEM address for a few CPUs:
	 * - Jetson Xavier : physical address
	 * - AMD 3950X     : I/O MMU address
	 * - Phenom II x4  : physical address
	 */
	// Check for, and translate through, the I/O MMU (if any)
	if ((dom = iommu_get_domain_for_dev(g->dev))) {
		phys = iommu_iova_to_phys(dom, addr);
		printk_debug(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", addr, phys);
	} else
		phys = addr;

	// I/O VA with no backing physical page: return NULL (not an ERR_PTR);
	// callers screen with IS_ERR_OR_NULL(), which catches both.
	if (!phys)
		return NULL;

	return phys_to_virt(phys);
}

/* Internal helper for search_page_directory().
   Depth-first searches the subtree rooted at the PDE at `pde_addr` (depth
   `level`, aperture `pde_target`) for a PTE mapping `addr_to_find` in
   `addr_to_find_aperture`. Returns the virtual-address bits contributed by
   levels at and below `level`, or 0 if no mapping was found. */
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
                                       uintptr_t pde_addr,
                                       enum PD_TARGET pde_target,
                                       uint64_t addr_to_find,
                                       enum INST_TARGET addr_to_find_aperture,
                                       uint32_t level) {
	uint64_t res, i;
	void __iomem *pde_kern;
	page_dir_entry_t entry;
	// Bound the recursion by the number of page-table levels. (The previous
	// check compared `level` against sizeof() of the level table — its size
	// in *bytes*, not its element count — which permitted out-of-bounds
	// indexing of NV_MMU_PT_V2_ENTRY_SZ/NV_MMU_PT_V2_SZ below.)
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to workaround PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_addr += 8;
	// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
	pde_kern = pd_deref(g, pde_addr, pde_target);
	if (IS_ERR_OR_NULL(pde_kern)) {
		printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
		return 0;
	}
	// Read the page directory entry (a pointer to another directory, or a PTE)
	entry.raw_w = readq(pde_kern);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		// TODO: Handle huge pages here
		printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
		return (uint64_t)entry.addr << 12 == addr_to_find && entry.aperture == addr_to_find_aperture;
	}
	printk_debug(KERN_DEBUG "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
	// A non-PTE entry at the deepest level cannot point to a further
	// directory; descending would index the level tables out-of-bounds.
	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Depth-first search of the page table
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
		printk_debug(KERN_DEBUG "[nvdebug] Searching index %llu in lvl %d\n", i, level + 1);
		res = search_page_directory_subtree(g, next, entry.target, addr_to_find, addr_to_find_aperture, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}

/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables
  Depth-first search a page directory of the GPU MMU for where a particular
  physical address is mapped. Upon finding a mapping, the virtual address is
  returned.

  The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
  across multiple apertures.

  @param pd_config    Page Directory configuration, containing pointer and
                      aperture for the start of the PDE3 entries
  @param addr_to_find Physical address to reconstruct the virtual address of
  @param addr_to_find_aperture Aperture (SYS_MEM or VID_MEM) of addr_to_find
  @return 0 on error, otherwise the virtual address at which addr_to_find is
          mapped into by this page table. (Zero is not a valid virtual address)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
                               page_dir_config_t pd_config,
                               uint64_t addr_to_find,
                               enum INST_TARGET addr_to_find_aperture) {
	// Base of the top-level (PDE3) directory, expanded from its 4k-page form
	uintptr_t pd_base = (uintptr_t)pd_config.page_dir << 12;
	uint64_t idx;
	// Only page-aligned physical addresses can be searched for; reject
	// anything with the low 12 bits set.
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk_info(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, pd_base);
	// Visit every PDE3 slot, recursing into each subtree until a match
	for (idx = 0; idx < NV_MMU_PT_V2_SZ[0]; idx++) {
		uintptr_t slot_addr = pd_base + NV_MMU_PT_V2_ENTRY_SZ[0] * idx;
		uint64_t sub_va = search_page_directory_subtree(g, slot_addr, INST2PD_TARGET(pd_config.target), addr_to_find, addr_to_find_aperture, 0);
		// Combine the lower-level bits with this slot's contribution
		if (sub_va)
			return (sub_va & ~0xfff) | (idx << NV_MMU_PT_V2_LSB[0]);
	}
	return 0;
}

/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
  (See `search_page_directory()` for documentation.)
 */
uint64_t search_v1_page_directory(struct nvdebug_state *g,
                                  page_dir_config_t pd_config,
                                  uint64_t addr_to_find,
                                  enum INST_TARGET addr_to_find_aperture) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	uintptr_t pte_offset, pde_offset;
	void __iomem *pte_addr, *pde_addr;

	// This function only understands the Page Table Version 1 format
	if (pd_config.is_ver2) {
		// (Message previously named a nonexistent "translate_v1_page_directory()";
		// use __func__ like the other error paths in this file.)
		printk(KERN_ERR "[nvdebug] Passed a Version 2 page table at %#018llx to %s()!\n", (uint64_t)pd_config.page_dir << 12, __func__);
		return 0;
	}

	// We only understand the Version 1 format when 128 KiB huge pages are in-use
	if (pd_config.is_64k_big_page) {
		printk(KERN_ERR "[nvdebug] Page Table Version 1 with 64 KiB huge pages is unsupported!\n");
		return 0;
	}

	printk_info(KERN_INFO "[nvdebug] Searching V1 page table at %#018lx in %s for addr %#018llx\n", (uintptr_t)pd_config.page_dir << 12, target_to_text(pd_config.target), addr_to_find);

	// For each PDE
	do {
		// Index the list of page directory entries
		pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
		// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
		pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
		if (IS_ERR_OR_NULL(pde_addr)) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), PTR_ERR(pde_addr));
			return 0;
		}
		// readq doesn't seem to work on BAR0, so assemble the 64-bit entry
		// from two 32-bit reads (high word first, then low)
		pde.raw = readl(pde_addr + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_addr);
		// Verify PDE is present (skips to the loop condition, advancing i)
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Index the list of page table entries starting at pde.alt_addr
			pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
			// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
			pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
			if (IS_ERR_OR_NULL(pte_addr)) {
				printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), PTR_ERR(pte_addr));
				return 0;
			}
			// Read page table entry, avoiding readq
			pte.raw = readl(pte_addr + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_addr);
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)pte.addr) << 12, target_to_text(pte.target), pte.is_volatile, pte.is_privileged, pte.is_readonly, pte.atomics_disabled, pte.raw);
			// If we find a matching PTE, reconstruct and return the virtual
			// address from the PDE and PTE indices
			if ((uint64_t)pte.addr << 12 == addr_to_find && pte.target == addr_to_find_aperture)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}

/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
				  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			if (pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0;  // No match
}
*/