Diffstat (limited to 'mmu.c')
-rw-r--r-- | mmu.c | 251 |
1 file changed, 251 insertions, 0 deletions
@@ -0,0 +1,251 @@
// Helpers to deal with NVIDIA's MMU and associated page tables
#include <linux/kernel.h> // Kernel types

#include "nvdebug.h"

/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
   a configurable 1MB window into VRAM which is mapped into BAR0 (register)
   space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
   and appears to be used today to bootstrap page table configuration.

   Why is it mapped at a location called NVIDIA Private RAM Instance? Because
   this used to point to the entirety of instance RAM, which was separate from
   VRAM on older NVIDIA GPUs.
*/
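
/* Illustrative sketch of how the PRAMIN window is used: point the window at
   the 64KiB-aligned region containing a VRAM address, then read through BAR0.
   This assumes a nvdebug_writel() counterpart to nvdebug_readl() exists; it
   is a hypothetical example, not part of the driver.
*/
static uint32_t pramin_read32_example(struct nvdebug_state *g, uint64_t vram_addr) {
	bar0_window_t window;
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	// The window base is in units of 64KiB of VRAM
	window.base = vram_addr >> 16;
	window.target = TARGET_VID_MEM;
	nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, window.raw); // hypothetical helper
	// vram_addr now falls within the first 64KiB of the 1MB window
	return readl(g->regs + NV_PRAMIN + (vram_addr & 0xffff));
}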

/* Convert a physical VRAM address to an offset in the PRAMIN window
   @param addr VRAM address to convert
   @return 0 on error, PRAMIN offset on success

   Note: Use phy2PRAMIN() instead if you want a dereferenceable address
*/
uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
	uint64_t pramin_base_va;
	bar0_window_t window;
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	// Check if the address is valid (49 bits are addressable on-GPU)
	if (addr & ~0x0001ffffffffffff) {
		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
		       addr, __func__);
		return 0;
	}
	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
	if (window.target != TARGET_VID_MEM)
		return 0;
	pramin_base_va = ((uint64_t)window.base) << 16;
	// Protect against out-of-bounds accesses (the window is NV_PRAMIN_LEN long)
	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
		return 0;
	return addr - pramin_base_va;
}

/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
   straightforward starting with Pascal ("page table version 2"), except for a
   few quirks (like 16-byte PDE0 entries, but all other entries are 8 bytes).

   All you really need to know is that any given Page Directory Entry (PDE)
   contains a pointer to the start of a 4k page densely filled with PDEs or
   Page Table Entries (PTEs).

   == Page Table Refresher ==
   Page tables convert virtual addresses to physical addresses, and they do
   this via a tree structure. Leaves (PTEs) contain a physical address, and
   the path from root to leaf is defined by the virtual address. Non-leaf
   nodes are PDEs. When descending, the virtual address is sliced into pieces,
   and one slice is used at each level (as an index) to select the
   next-visited node (in level+1).

   V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
   PTEs. How the virtual address is sliced to yield an index into each level
   and a page offset is shown by Fig 1.

   == Figure 1 ==
   Page Offset (12 bits) <---------------------------------------+
   Page Table Entry (PTE) (9 bits) <--------------------+        |
   Page Directory Entry (PDE) 0 (8 bits) <-----+        |        |
   PDE1 (9 bits) <--------------------+        |        |        |
   PDE2 (9 bits) <-----------+        |        |        |        |
   PDE3 (2 bits) <--+        |        |        |        |        |
                    ^        ^        ^        ^        ^        ^
   Virtual addr: [48, 47] [46, 38] [37, 29] [28, 21] [20, 12] [11, 0]

   The following arrays merely represent different projections of Fig. 1, and
   only one is strictly needed to reconstruct all the others. However, due to
   the complexity of page tables, we include all of these to aid readability.
*/
// How many nodes/entries per level in V2 of NVIDIA's page table format
static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
// Size in bytes of an entry at a particular level
static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};
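
/* For illustration: how Fig. 1 maps onto the arrays above. The index into
   the level-`lvl` directory/table for virtual address `va` is the
   log2(NV_MMU_PT_V2_SZ[lvl]) bits starting at bit NV_MMU_PT_V2_LSB[lvl].
   This helper is a sketch and is not used by the walkers below.
*/
static inline uint64_t v2_va_to_index(uint64_t va, int lvl) {
	return (va >> NV_MMU_PT_V2_LSB[lvl]) & (NV_MMU_PT_V2_SZ[lvl] - 1);
}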

// Convert a GPU physical address to a CPU virtual address via the PRAMIN window
// @return A dereferenceable address on success, NULL on error
void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
	uint32_t off = vram2PRAMIN(g, phy);
	// vram2PRAMIN() returns 0 on error (note that this conflates an error
	// with an address falling at the very start of the window)
	if (!off)
		return NULL;
	return g->regs + NV_PRAMIN + off;
}

/* FIXME
void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
	return g->bar2 + off;
}
*/

uint64_t search_page_directory_subtree(struct nvdebug_state *g,
                                       void __iomem *pde_offset,
                                       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
                                       uint64_t addr_to_find,
                                       uint32_t level) {
	uint64_t res, i;
	void __iomem *next;
	page_dir_entry_t entry;
	// Guard against indexing off the end of the level-description arrays
	// (sizeof() would give the size in bytes, not the number of entries)
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to work around PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_offset += 8;
	entry.raw = readl(pde_offset);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want. The 1 returned
	// on a match acts as a "found" flag; it lands in the page-offset bits
	// and is masked off by search_page_directory().
	if (entry.is_pte) {
		printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n", ((u64)entry.addr) << 12, entry.raw);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n", ((u64)entry.addr) << 12, entry.target, level, entry.raw);
	// A non-PTE entry at the deepest level is malformed
	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Depth-first search of the page table; this PDE's children live at
	// level + 1, so use that level's entry count and entry size
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
		// off2addr can fail
		if (!next) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
			return 0;
		}
		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}

/* Search a page directory of the GPU MMU
   @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
   @param off2addr     Function to convert VRAM physical addresses to valid CPU VAs
   @param addr_to_find Physical address to reconstruct the virtual address of
   @return 0 on error, otherwise the virtual address at which addr_to_find is
           mapped by this page table.
*/
uint64_t search_page_directory(struct nvdebug_state *g,
                               void __iomem *pde_offset,
                               void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
                               uint64_t addr_to_find) {
	uint64_t res, i;
	// Make sure that the query is page-aligned
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	// Search the top-level page directory (PDE3)
	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
		if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
	return 0;
}
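
/* Illustrative usage sketch (not part of the driver): given the physical
   address of a context's root page directory (the PDB, typically read from
   an instance block), reverse-translate a physical address. `pdb_phys` and
   `target_phys` are hypothetical inputs.
*/
static uint64_t example_v2_reverse_lookup(struct nvdebug_state *g,
                                          uint64_t pdb_phys,
                                          uint64_t target_phys) {
	// Make the PDE3 entries dereferenceable via the PRAMIN window
	void __iomem *pd = phy2PRAMIN(g, pdb_phys);
	if (!pd)
		return 0;
	// Returns the GPU virtual address that maps target_phys, or 0 if unmapped
	return search_page_directory(g, pd, phy2PRAMIN, target_phys);
}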

/* GMMU Page Tables Version 1
   This page table only contains 2 levels and is used in the Fermi, Kepler,
   and Maxwell architectures.
*/
// Number of entries in the PDE and PTE levels
static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 1<<13 is an educated guess!!!
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
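
/* The V1 (and V0) walkers read 64-bit entries as two 32-bit reads, since
   readq doesn't seem to work on BAR0. A sketch of that pattern as a helper;
   the name is ours, and the walkers below keep the reads inline:
*/
static inline uint64_t readq_via_readl(void __iomem *addr) {
	// The high word lives at the higher offset; read it first, as below
	uint64_t raw = readl(addr + 4);
	raw <<= 32;
	raw |= readl(addr);
	return raw;
}
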
uint64_t search_v1_page_directory(struct nvdebug_state *g,
                                  void __iomem *pde_offset,
                                  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
                                  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		// Convert to a dereferenceable pointer from CPU virtual address space
		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
		if (!pte_offset)
			continue;
		// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
		// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Don't overrun the PRAMIN window (check the entry we're about to read)
			if (pte_offset + (j + 1) * sizeof(page_tbl_entry_v1_t) > g->regs + NV_PRAMIN + NV_PRAMIN_LEN)
				return 0;
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}

/* GMMU Page Tables Version 0
   This page table only contains 2 levels and is used in the Tesla architecture
*/
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
                                  void __iomem *pde_offset,
                                  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
                                  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//	printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v0_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0; // No match
}
*/