aboutsummaryrefslogblamecommitdiffstats
path: root/mmu.c
blob: 4881f66d2211cd21924f8e0bcec0a03a963377e5 (plain) (tree)







































































































































































































                                                                                                                                                                                                               

















































                                                                                                                                                                 
// Helpers to deal with NVIDIA's MMU and associated page tables
#include <linux/kernel.h>  // Kernel types

#include "nvdebug.h"

/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
  a configurable 1MB window into VRAM which is mapped into BAR0 (register)
  space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
  and appears to be used today to bootstrap page table configuration.

  Why is it mapped at a location called NVIDIA Private RAM Instance? Because
  this used to point to the entirety of instance RAM, which was separate from
  VRAM on older NVIDIA GPUs.
*/

/* Convert a physical VRAM address to an offset in the PRAMIN window
  @param addr VRAM address to convert
  @return 0 on error, PRAMIN offset on success

  Note: Use off2PRAMIN() instead if you want a dereferenceable address
  Note: A return of 0 is ambiguous -- it is also the legitimate offset of the
        window base itself. Callers needing to distinguish should compare addr
        against the window base (see phy2PRAMIN()).
*/
uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
	uint64_t pramin_base_va;
	bar0_window_t window;
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	// Check if the address is valid (49 bits are addressable on-GPU)
	if (addr & ~0x0001ffffffffffff) {
		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
		       addr, __func__);
		return 0;
	}
	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
	if (window.target != TARGET_VID_MEM)
		return 0;
	pramin_base_va = ((uint64_t)window.base) << 16;
	// Protect against out-of-bounds accesses. Valid offsets are
	// [0, NV_PRAMIN_LEN), so an address at exactly base + NV_PRAMIN_LEN
	// is already past the end of the window (old check used '>').
	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
		return 0;
	return addr - pramin_base_va;
}

/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
  straight-forward starting with Pascal ("page table version 2"), except for a
  few quirks (like 16-byte PDE0 entries, but all other entries are 8 bytes).

  All you really need to know is that any given Page Directory Entry (PDE)
  contains a pointer to the start of a 4k page densely filled with PDEs or Page
  Table Entries (PTEs).

  == Page Table Refresher ==
  Page tables convert virtual addresses to physical addresses, and they do this
  via a tree structure. Leaves (PTEs) contain a physical address, and the path
  from root to leaf is defined by the virtual address. Non-leaf nodes are PDEs.
  When descending, the virtual address is sliced into pieces, and one slice is
  used at each level (as an index) to select the next-visited node (in level+1).

  V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
  PTEs. How the virtual address is sliced to yield an index into each level and
  a page offset is shown by Fig 1.

  == Figure 1 ==
  Page Offset (12 bits) <---------------------------------------+
  Page Table Entry (PTE) (9 bits) <--------------------+        |
  Page Directory Entry (PDE) 0 (8 bits) <-----+        |        |
  PDE1 (8 bits) <--------------------+        |        |        |
  PDE2 (8 bits) <-----------+        |        |        |        |
  PDE3 (2 bits) <--+        |        |        |        |        |
                   ^        ^        ^        ^        ^        ^
  Virtual addr: [49, 47] [46, 38] [37, 29] [28, 21] [20, 12] [11, 0]

  The following arrays merely represent different projections of Fig. 1, and
  only one is strictly needed to reconstruct all the others. However, due to
  the complexity of page tables, we include all of these to aid in readability.
*/
// How many nodes/entries per level in V2 of NVIDIA's page table format
// (index 0 == PDE3 ... index 4 == PTE; a projection of Fig. 1 above)
static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
// Size in bytes of an entry at a particular level
// (PDE0 entries are 16 bytes -- the double-size quirk noted above)
static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};

/* Convert a GPU physical address to a CPU virtual address via the PRAMIN window
  @param phy GPU physical (VRAM) address
  @return Dereferenceable CPU virtual address, or NULL if phy is not currently
          reachable through the PRAMIN window.
*/
void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
	bar0_window_t window;
	uint32_t off = vram2PRAMIN(g, phy);
	// vram2PRAMIN() returns 0 both on failure and for the (legitimate)
	// offset of the window base itself. Disambiguate by re-reading the
	// window, so that callers' NULL checks actually catch failures
	// (previously this function could never return NULL).
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	if (off == 0 && (window.target != TARGET_VID_MEM ||
	                 phy != ((uint64_t)window.base) << 16))
		return NULL;
	return g->regs + NV_PRAMIN + off;
}

/* FIXME
void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
	return g->bar2 + off;
}
*/

/* Recursively search one subtree of a V2 page directory
  @param pde_offset   Dereferenceable pointer to a single PDE at `level`
  @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
  @param addr_to_find Physical address to reconstruct the virtual address of
  @param level        Depth in the table (0 == PDE3 ... 4 == PTE; see Fig. 1)
  @return 0 if not found, otherwise the partial virtual address accumulated
          from this level down. The PTE match sets bit 0 as a "found" flag;
          the top-level caller (search_page_directory) masks it off.
*/
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       void __iomem *pde_offset,
				       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				       uint64_t addr_to_find,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *next;
	page_dir_entry_t entry;
	// Bound recursion by the number of levels. (The old check compared
	// against sizeof() -- the byte size 20, not the element count 5.)
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to workaround PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_offset += 8;
	entry.raw = readl(pde_offset);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n", ((u64)entry.addr) << 12, entry.raw);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n", ((u64)entry.addr) << 12, entry.target, level, entry.raw);
	// A non-PTE entry at the deepest level is malformed; stop here so the
	// level + 1 array accesses below stay in-bounds
	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Depth-first search of the page table. This PDE points to a page of
	// level + 1 entries, so iterate over the *next* level's entry count.
	// (The old code used NV_MMU_PT_V2_SZ[level], scanning e.g. only 256
	// of the 512 PTEs under a PDE0 entry.)
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
		// off2addr can fail
		if (!next) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
			return 0;
		}
		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}

/* Search a page directory of the GPU MMU
  @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
  @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
  @param addr_to_find Physical address to reconstruct the virtual address of
  @return 0 on error, otherwise the virtual address at which addr_to_find is
          mapped into by this page table.
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       void __iomem *pde_offset,
			       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
			       uint64_t addr_to_find) {
	uint64_t partial_va, idx;
	// PTEs only record page-aligned addresses; refuse unaligned queries
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	// Walk every top-level (PDE3) entry and search its subtree
	for (idx = 0; idx < NV_MMU_PT_V2_SZ[0]; idx++) {
		partial_va = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * idx, off2addr, addr_to_find, 0);
		if (partial_va) {
			// Mask off the low "found" flag bit set by the PTE
			// match, then fold in this top-level index
			return (partial_va & ~0xfff) | (idx << NV_MMU_PT_V2_LSB[0]);
		}
	}
	return 0;
}

/* GMMU Page Tables Version 1
  This page table only contains 2 levels and is used in the Fermi, Kepler, and
  Maxwell architectures
*/
// Number of entries in the PDE and PTE levels
static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13};  // 1<<13 is an educated guess!!!
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V1_LSB[2] = {25, 12};  // 25 is an educated guess!!!
/* Search a V1 (Fermi/Kepler/Maxwell) page directory of the GPU MMU
  @param pde_offset   Dereferenceable pointer to the start of the PDE entries
  @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
  @param addr_to_find Physical address to reconstruct the virtual address of
  @return 0 on error or no match, otherwise the virtual address at which
          addr_to_find is mapped into by this page table.

  Note: Only the alt_* (small-page) subtable of each PDE is searched.
  Note: The bounds check below assumes off2addr maps into the PRAMIN window
        (g->regs + NV_PRAMIN) -- TODO confirm for non-PRAMIN converters.
*/
uint64_t search_v1_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0, so assemble the 64-bit
		// entry from two 32-bit reads (high word first)
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
		// Skip PDEs with neither a big- nor small-page table present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		// Convert to a dereferencable pointer from CPU virtual address space
		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
		if (!pte_offset)
			continue;
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Don't overrun the PRAMIN window. Include the full
			// 8-byte entry at index j in the bound -- the old
			// check only compared the table base, so the reads
			// below could still run past the end of the window.
			if (pte_offset + (j + 1) * sizeof(page_tbl_entry_v1_t) > g->regs + NV_PRAMIN + NV_PRAMIN_LEN)
				return 0;
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}

/* GMMU Page Tables Version 0
  This page table only contains 2 levels and is used in the Tesla architecture
*/
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
				  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			if (pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0;  // No match
}
*/