path: root/mmu.c
Diffstat (limited to 'mmu.c')
-rw-r--r--  mmu.c  206
1 file changed, 110 insertions(+), 96 deletions(-)
diff --git a/mmu.c b/mmu.c
index e420864..70c00f9 100644
--- a/mmu.c
+++ b/mmu.c
@@ -1,117 +1,129 @@
-// Helpers to deal with NVIDIA's MMU and associated page tables
+/* Copyright 2024 Joshua Bakita
+ * Helpers to deal with NVIDIA's MMU and associated page tables
+ */
+#include <linux/err.h> // ERR_PTR() etc.
+#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
 #include <linux/kernel.h> // Kernel types
 
 #include "nvdebug.h"
 
-/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
-   a configurable 1MB window into VRAM which is mapped into BAR0 (register)
-   space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
-   and appear to be used today to bootstrap page table configuration.
+// Uncomment to print every PDE and PTE walked for debugging
+//#define DEBUG
+#ifdef DEBUG
+#define printk_debug printk
+#else
+#define printk_debug(...)
+#endif
 
-   Why is it mapped at a location called NVIDIA Private RAM Instance? Because
-   this used to point to the entirety of intance RAM, which was seperate from
-   VRAM on older NVIDIA GPUs.
-*/
+/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
 
-/* Convert a physical VRAM address to an offset in the PRAMIN window
-   @param addr VRAM address to convert
-   @return -errno on error, PRAMIN offset on success
+   I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
+   AMDGPU driver.
 
-   Note: Use off2PRAMIN() instead if you want a dereferenceable address
-   Note: PRAMIN window is only 1MB, so returning an int is safe
-*/
-static int vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
-	uint64_t pramin_base_va;
-	bar0_window_t window;
-	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
-	// Check if the address is valid (49 bits are addressable on-GPU)
-	if (addr & ~0x0001ffffffffffff) {
-		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
-		       addr, __func__);
-		return -EINVAL;
+   @param addr Pointer from page directory entry (PDE)
+   @param pd_ap PD-type aperture (target address space) for `addr`
+   @return A dereferencable kernel address, or an ERR_PTR-wrapped error
+ */
+void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
+	struct iommu_domain *dom;
+	phys_addr_t phys;
+
+	// Validate arguments
+	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
+		return ERR_PTR(-EINVAL);
+
+	// VID_MEM accesses are the simple common-case
+	if (pd_ap == PD_AND_TARGET_VID_MEM) {
+		// Using BAR2 requires a page-table traversal. As this function is part
+		// of the page-table traversal process, it must instead use PRAMIN.
+		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
+		if (off < 0)
+			return ERR_PTR(off);
+		return g->regs + NV_PRAMIN + off;
 	}
-	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
-	if (window.target != TARGET_VID_MEM)
-		return -EFAULT;
-	pramin_base_va = ((uint64_t)window.base) << 16;
-	// Protect against out-of-bounds accesses
-	if (addr < pramin_base_va || addr > pramin_base_va + NV_PRAMIN_LEN)
-		return -ERANGE;
-	return addr - pramin_base_va;
-}
+	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
+	 * driver are known to create page directory entries in SYS_MEM.
+	 *
+	 * On systems using an I/O MMU, or some other I/O virtual address space,
+	 * these are **not** physical addresses, and must first be translated
+	 * through the I/O MMU before use.
+	 * Example default meaning of a SYS_MEM address for a few CPUs:
+	 * - Jetson Xavier : physical address
+	 * - AMD 3950X : I/O MMU address
+	 * - Phenom II x4 : physical address
+	 */
+	// Check for, and translate through, the I/O MMU (if any)
+	if ((dom = iommu_get_domain_for_dev(g->dev))) {
+		phys = iommu_iova_to_phys(dom, addr);
+		printk(KERN_ERR "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys);
+	} else
+		phys = addr;
 
-// Convert a GPU physical address to CPU virtual address via the PRAMIN window
-// @return A dereferencable address, or 0 (an invalid physical address) on err
-void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
-	int off = vram2PRAMIN(g, phy);
-	if (off == -ERANGE)
-		printk(KERN_ERR "[nvdebug] Page table walk off end of PRAMIN!\n");
-	if (off < 0)
+	if (!phys)
 		return 0;
-	return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
-}
 
-/* FIXME
-void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
-	return g->bar2 + off;
+	return phys_to_virt(addr);
 }
-*/
 
 // Internal helper for search_page_directory().
 uint64_t search_page_directory_subtree(struct nvdebug_state *g,
-                                       void __iomem *pde_offset,
-                                       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+                                       uintptr_t pde_addr,
+                                       enum PD_TARGET pde_target,
                                        uint64_t addr_to_find,
                                        uint32_t level) {
 	uint64_t res, i;
-	void __iomem *next;
+	void __iomem *pde_kern;
 	page_dir_entry_t entry;
 	if (level > sizeof(NV_MMU_PT_V2_SZ))
 		return 0;
 	// Hack to workaround PDE0 being double-size and strangely formatted
 	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
-		pde_offset += 8;
-	entry.raw_w = readq(pde_offset);
+		pde_addr += 8;
+	// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
+	pde_kern = pd_deref(g, pde_addr, pde_target);
+	if (IS_ERR_OR_NULL(pde_kern)) {
+		printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
+		return 0;
+	}
+	// Read the page directory entry (a pointer to another directory, or a PTE)
+	entry.raw_w = readq(pde_kern);
 	// If we reached an invalid (unpopulated) PDE, walk back up the tree
 	if (entry.target == PD_AND_TARGET_INVALID)
 		return 0;
 	// Succeed when we reach a PTE with the address we want
 	if (entry.is_pte) {
 		// TODO: Handle huge pages here
-		printk(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
+		printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
 		return (uint64_t)entry.addr << 12 == addr_to_find;
 	}
-	printk(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
+	printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
 	// Depth-first search of the page table
 	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
-		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
-		// off2addr can fail
-		if (!next || !entry.addr_w) {
-			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
-			return 0;
-		}
-		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
+		uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
+		res = search_page_directory_subtree(g, next, entry.target, addr_to_find, level + 1);
 		if (res)
 			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
 	}
 	return 0;
 }
 
-/* GPU Physical address -> Virtual address ("reverse" translation)
+/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables
+
+   Depth-first search a page directory of the GPU MMU for where a particular
+   physical address is mapped. Upon finding a mapping, the virtual address is
+   returned.
 
-   Depth-first search a page directory of the GPU MMU for where a particular
-   physical address is mapped. Upon finding a mapping, the virtual address is
-   returned.
+   The page directory may be located in VID_MEM, SYS_MEM, or some combination of
+   the two.
 
-   @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
-   @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
+   @param pd_config    Page Directory configuration, containing pointer and
+                       aperture for the start of the PDE3 entries
    @param addr_to_find Physical address to reconstruct the virtual address of
   @return 0 on error, otherwise the virtual address at which addr_to_find is
            mapped into by this page table. (Zero is not a valid virtual address)
 */
 uint64_t search_page_directory(struct nvdebug_state *g,
-                               void __iomem *pde_offset,
-                               void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+                               page_dir_config_t pd_config,
                                uint64_t addr_to_find) {
 	uint64_t res, i;
 	// Make sure that the query is page-aligned
@@ -119,57 +131,62 @@ uint64_t search_page_directory(struct nvdebug_state *g,
 		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
 		return 0;
 	}
-	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018llx\n", (u64)addr_to_find, (u64)pde_offset);
+	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, (uintptr_t)pd_config.page_dir << 12);
 	// Search the top-level page directory (PDE3)
 	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
-		if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
+		if ((res = search_page_directory_subtree(g, ((uintptr_t)pd_config.page_dir << 12) + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, 0)))
 			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
 	return 0;
 }
 
-/* GMMU Page Tables Version 1
-   This page table only contains 2 levels and is used in the Fermi, Kepler, and
-   Maxwell architectures
-*/
-// Number of entries in the PDE and PTE levels
-static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 2<<13 is an educated guess!!!
-// Which bit index is the least significant in indexing each page level
-static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
+/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
+   (See `search_page_directory()` for documentation.)
+ */
 uint64_t search_v1_page_directory(struct nvdebug_state *g,
-                                  void __iomem *pde_offset,
-                                  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+                                  page_dir_config_t pd_config,
                                   uint64_t addr_to_find) {
 	uint64_t j, i = 0;
 	page_dir_entry_v1_t pde;
 	page_tbl_entry_v1_t pte;
-	void __iomem *pte_offset;
+	uintptr_t pte_offset, pde_offset;
+	void __iomem *pte_addr, *pde_addr;
 	// For each PDE
 	do {
+		// Index the list of page directory entries
+		pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
+		// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
+		pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
+		if (IS_ERR_OR_NULL(pde_addr)) {
+			printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr));
+			return 0;
+		}
 		// readq doesn't seem to work on BAR0
-		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
+		pde.raw = readl(pde_addr + 4);
 		pde.raw <<= 32;
-		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
+		pde.raw |= readl(pde_addr);
 		// Verify PDE is present
 		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
 			continue;
-		// Convert to a dereferencable pointer from CPU virtual address space
-		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
-		if (!pte_offset)
-			continue;
 //		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
-//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
+		printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
 		// For each PTE
 		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
-			// Don't overrun the PRAMIN window
-			if (pte_offset > NV_PRAMIN + g->regs + NV_PRAMIN_LEN)
+			// Index the list of page table entries starting at pde.alt_addr
+			pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
+			// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
+			pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
+			if (IS_ERR_OR_NULL(pte_addr)) {
+				printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr));
 				return 0;
-			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
+			}
+			// Read page table entry, avoiding readq
+			pte.raw = readl(pte_addr + 4);
 			pte.raw <<= 32;
-			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
+			pte.raw |= readl(pte_addr);
 			// Skip non-present PTEs
 			if (!pte.is_present)
 				continue;
-//			printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
+			printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
 			// If we find a matching PTE, return its virtual address
 			if ((uint64_t)pte.addr << 12 == addr_to_find)
 				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
@@ -178,9 +195,6 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
 	return 0;
 }
 
-/* GMMU Page Tables Version 0
-   This page table only contains 2 levels and is used in the Tesla architecture
-*/
 /* *** UNTESTED ***
 #define NV_MMU_PT_V0_SZ 2048
 #define NV_MMU_PT_V0_LSB 29
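
For readers following the interface change: search_page_directory() and search_v1_page_directory() now take a page_dir_config_t (raw page-directory pointer plus aperture) rather than a pre-translated __iomem pointer and an off2addr callback, with pd_deref() performing the VID_MEM/SYS_MEM (and I/O MMU) translation internally. A minimal caller sketch under those assumptions follows; the wrapper name and the way pd_config is obtained are illustrative and not part of this patch.

/* Hypothetical helper: reverse-translate a GPU physical address through the
 * page tables described by pd_config (assumed to have been read from an
 * instance block elsewhere; not shown in this patch). */
static uint64_t example_phys_to_gpu_va(struct nvdebug_state *g,
                                       page_dir_config_t pd_config,
                                       uint64_t phys_addr)
{
	// search_page_directory() expects a page-aligned query and returns 0
	// (never a valid GPU virtual address) when no mapping is found.
	uint64_t vaddr = search_page_directory(g, pd_config, phys_addr & ~0xfffull);

	if (!vaddr)
		printk(KERN_WARNING "[nvdebug] No mapping found for %#018llx\n", phys_addr);
	return vaddr;
}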