1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
|
/* Copyright 2024 Joshua Bakita
* Helpers to deal with NVIDIA's MMU and associated page tables
*/
#include <linux/err.h> // ERR_PTR() etc.
#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
#include <linux/kernel.h> // Kernel types
#include "nvdebug.h"
// Uncomment to print every PDE and PTE walked for debugging
//#define DEBUG
#ifdef DEBUG
#define printk_debug printk
#else
#define printk_debug(...)
#endif
/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
AMDGPU driver.
@param addr Pointer from page directory entry (PDE)
@param pd_ap PD-type aperture (target address space) for `addr`
@return A dereferencable kernel address, or an ERR_PTR-wrapped error
*/
void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
	struct iommu_domain *dom;
	phys_addr_t phys;
	// Validate arguments
	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
		return ERR_PTR(-EINVAL);
	// VID_MEM accesses are the simple common-case
	if (pd_ap == PD_AND_TARGET_VID_MEM) {
		// Using BAR2 requires a page-table traversal. As this function is part
		// of the page-table traversal process, it must instead use PRAMIN.
		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
		if (off < 0)
			return ERR_PTR(off);
		return g->regs + NV_PRAMIN + off;
	}
	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
	 * driver are known to create page directory entries in SYS_MEM.
	 *
	 * On systems using an I/O MMU, or some other I/O virtual address space,
	 * these are **not** physical addresses, and must first be translated
	 * through the I/O MMU before use.
	 * Example default meaning of a SYS_MEM address for a few CPUs:
	 * - Jetson Xavier : physical address
	 * - AMD 3950X     : I/O MMU address
	 * - Phenom II x4  : physical address
	 */
	// Check for, and translate through, the I/O MMU (if any)
	if ((dom = iommu_get_domain_for_dev(g->dev))) {
		phys = iommu_iova_to_phys(dom, addr);
		// A successful translation is informational, not an error condition
		printk(KERN_INFO "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys);
	} else
		phys = addr;
	// A failed translation (or zero physical address) is not dereferencable
	if (!phys)
		return NULL;
	return phys_to_virt(phys);
}
// Internal helper for search_page_directory().
/* Internal helper for search_page_directory().
   Depth-first searches the subtree rooted at the page directory entry at
   `pde_addr` (in aperture `pde_target`) for a PTE mapping `addr_to_find`.
   @param level Zero-based depth of this entry in the V2 page table
   @return 0 if not found; otherwise the partial virtual address built from
           the per-level entry indices at and below this level.
 */
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       uintptr_t pde_addr,
				       enum PD_TARGET pde_target,
				       uint64_t addr_to_find,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *pde_kern;
	page_dir_entry_t entry;
	// Bound recursion by the number of page-table levels. (Plain sizeof()
	// on the array would be its size in *bytes*, which over-permits
	// recursion and allows out-of-bounds indexing of the level tables.)
	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Hack to workaround PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_addr += 8;
	// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
	pde_kern = pd_deref(g, pde_addr, pde_target);
	if (IS_ERR_OR_NULL(pde_kern)) {
		printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
		return 0;
	}
	// Read the page directory entry (a pointer to another directory, or a PTE)
	entry.raw_w = readq(pde_kern);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		// TODO: Handle huge pages here
		printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
	// A non-PTE entry at the deepest level is malformed; recursing further
	// would index past the end of the level-description tables.
	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
		return 0;
	// Depth-first search of the page table
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
		res = search_page_directory_subtree(g, next, entry.target, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}
/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables
Depth-first search a page directory of the GPU MMU for where a particular
physical address is mapped. Upon finding a mapping, the virtual address is
returned.
The page directory may be located in VID_MEM, SYS_MEM, or some combination of
the two.
@param pd_config Page Directory configuration, containing pointer and
aperture for the start of the PDE3 entries
@param addr_to_find Physical address to reconstruct the virtual address of
@return 0 on error, otherwise the virtual address at which addr_to_find is
mapped into by this page table. (Zero is not a valid virtual address)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
                               page_dir_config_t pd_config,
                               uint64_t addr_to_find) {
	uint64_t entry_idx, sub_va;
	uintptr_t pd_base = (uintptr_t)pd_config.page_dir << 12;
	// Refuse unaligned queries; the low 12 bits are the in-page offset
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, pd_base);
	// Walk each top-level (PDE3) entry, recursing into its subtree
	for (entry_idx = 0; entry_idx < NV_MMU_PT_V2_SZ[0]; entry_idx++) {
		sub_va = search_page_directory_subtree(g, pd_base + NV_MMU_PT_V2_ENTRY_SZ[0] * entry_idx, INST2PD_TARGET(pd_config.target), addr_to_find, 0);
		if (sub_va)
			return (sub_va & ~0xfff) | (entry_idx << NV_MMU_PT_V2_LSB[0]);
	}
	return 0;
}
/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
(See `search_page_directory()` for documentation.)
*/
uint64_t search_v1_page_directory(struct nvdebug_state *g,
                                  page_dir_config_t pd_config,
                                  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	uintptr_t pte_offset, pde_offset;
	void __iomem *pte_addr, *pde_addr;
	// For each PDE
	do {
		// Index the list of page directory entries
		pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
		// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
		pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
		if (IS_ERR_OR_NULL(pde_addr)) {
			// Log the raw (negative) code, consistent with search_page_directory_subtree()
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), PTR_ERR(pde_addr));
			return 0;
		}
		// readq doesn't seem to work on BAR0; assemble 64 bits from two readl()s
		pde.raw = readl(pde_addr + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_addr);
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		// NOTE(review): only the alt_* page table is walked below; mappings
		// reachable solely via the non-alt pointer are not searched — confirm
		// this is intentional for the V1 format.
		printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Index the list of page table entries starting at pde.alt_addr
			pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
			// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
			pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
			if (IS_ERR_OR_NULL(pte_addr)) {
				printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), PTR_ERR(pte_addr));
				return 0;
			}
			// Read page table entry, avoiding readq
			pte.raw = readl(pte_addr + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_addr);
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
void __iomem *pde_offset,
void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
uint32_t addr_to_find) {
int j, i = 0;
page_dir_entry_v0_t pde;
page_tbl_entry_v0_t pte;
void __iomem *pte_offset;
// For each PDE
do {
// readq doesn't seem to work on BAR0
pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
pde.raw <<= 32;
pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
//if (pde.raw)
//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
// Skip unpopulated PDEs
if (pde.type == NOT_PRESENT)
continue;
//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
// For each PTE
for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
pte.raw <<= 32;
pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
// Skip non-present PTEs
if (!pte.is_present)
continue;
// If we find a matching PTE, return its virtual address
//if (pte.addr != 0x5555555)
// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
if (pte.addr << 12 == addr_to_find)
return i << NV_MMU_PT_V0_LSB | j << 12;
}
} while (++i < NV_MMU_PT_V0_SZ);
return 0; // No match
}
*/
|