/* Copyright 2024 Joshua Bakita
 * Helpers to deal with NVIDIA's MMU and associated page tables
 */
#include <linux/err.h>  // ERR_PTR() etc.
#include <linux/iommu.h>  // iommu_get_domain_for_dev() and iommu_iova_to_phys()
#include <linux/kernel.h>  // Kernel types

#include "nvdebug.h"

// Uncomment to print every PDE and PTE walked for debugging
//#define DEBUG
#ifdef DEBUG
#define printk_debug printk
#else
#define printk_debug(...)
#endif

/* Convert a page directory (PD) pointer and aperture to a kernel-accessible
   address

  I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
  AMDGPU driver.

  @param g     nvdebug state for the GPU that owns the page directory
  @param addr  Pointer from page directory entry (PDE)
  @param pd_ap PD-type aperture (target address space) for `addr`
  @return A dereferenceable kernel address, or an ERR_PTR-wrapped error
 */
void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
	struct iommu_domain *dom;
	phys_addr_t phys;

	// Validate arguments
	if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
		return ERR_PTR(-EINVAL);

	// VID_MEM accesses are the simple common case
	if (pd_ap == PD_AND_TARGET_VID_MEM) {
		// Using BAR2 requires a page-table traversal. As this function is part
		// of the page-table traversal process, it must instead use PRAMIN.
		int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
		if (off < 0)
			return ERR_PTR(off);
		return g->regs + NV_PRAMIN + off;
	}
	/* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
	 * driver are known to create page directory entries in SYS_MEM.
	 *
	 * On systems using an I/O MMU, or some other I/O virtual address space,
	 * these are **not** physical addresses, and must first be translated
	 * through the I/O MMU before use.
	 * Example default meaning of a SYS_MEM address for a few CPUs:
	 * - Jetson Xavier : physical address
	 * - AMD 3950X     : I/O MMU address
	 * - Phenom II x4  : physical address
	 */
	// Check for, and translate through, the I/O MMU (if any)
	if ((dom = iommu_get_domain_for_dev(g->dev))) {
		phys = iommu_iova_to_phys(dom, addr);
		printk(KERN_ERR "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys);
	} else
		phys = addr;

	// The I/O MMU returns 0 if it has no mapping for this I/O virtual address
	if (!phys)
		return NULL;

	// Use the translated physical address, not the original I/O virtual address
	return phys_to_virt(phys);
}
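
/* Illustrative (uncompiled) sketch of the intended usage pattern, mirroring
 * search_page_directory_subtree() below. `example_read_pd_entry` is a
 * hypothetical helper for illustration only, not part of this driver:

static uint64_t example_read_pd_entry(struct nvdebug_state *g,
				      uintptr_t entry_addr,
				      enum PD_TARGET target) {
	// Resolve the VID_MEM/SYS_MEM pointer to something the CPU can dereference
	void __iomem *entry_kern = pd_deref(g, entry_addr, target);
	if (IS_ERR_OR_NULL(entry_kern))
		return 0;  // Treat 0 as "entry unreadable"
	// V2 page directory/table entries are 64 bits wide
	return readq(entry_kern);
}
*/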

// Internal helper for search_page_directory().
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       uintptr_t pde_addr,
				       enum PD_TARGET pde_target,
				       uint64_t addr_to_find,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *pde_kern;
	page_dir_entry_t entry;
	// Bound recursion to the number of levels in the V2 page table layout
	if (level >= sizeof(NV_MMU_PT_V2_SZ) / sizeof(NV_MMU_PT_V2_SZ[0]))
		return 0;
	// Hack to workaround PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_addr += 8;
	// Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
	pde_kern = pd_deref(g, pde_addr, pde_target);
	if (IS_ERR_OR_NULL(pde_kern)) {
		printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
		return 0;
	}
	// Read the page directory entry (a pointer to another directory, or a PTE)
	entry.raw_w = readq(pde_kern);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		// TODO: Handle huge pages here
		printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
	// Depth-first search of the page table
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
		res = search_page_directory_subtree(g, next, entry.target, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}
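
/* How the returned virtual address is assembled (illustrative; assumes the V2
 * layout has five levels, PDE3 through PDE0 plus the PTE level):
 *
 *   va = (i_pde3 << NV_MMU_PT_V2_LSB[0])  // OR-ed in by search_page_directory()
 *      | (i_pde2 << NV_MMU_PT_V2_LSB[1])  // OR-ed in level by level as each
 *      | (i_pde1 << NV_MMU_PT_V2_LSB[2])  //  recursive call returns
 *      | (i_pde0 << NV_MMU_PT_V2_LSB[3])
 *      | (i_pte  << NV_MMU_PT_V2_LSB[4]);
 *
 * The PTE-level call returns 1 on a match, so bit 0 rides along in the result
 * until search_page_directory() clears the page-offset bits via `res & ~0xfff`.
 * Only page-aligned physical addresses are searched for, so the low 12 bits of
 * the returned virtual address are always zero.
 */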

/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables

  Depth-first search of a GPU MMU page directory for where a particular
  physical address is mapped. Upon finding a mapping, the corresponding
  virtual address is returned.

  The page directory may be located in VID_MEM, SYS_MEM, or some combination of
  the two.

  @param g            nvdebug state for the GPU that owns the page directory
  @param pd_config    Page directory configuration, containing the pointer and
                      aperture for the start of the PDE3 entries
  @param addr_to_find Physical address to reconstruct the virtual address of
  @return 0 if addr_to_find is unmapped or on error, otherwise the virtual
          address at which addr_to_find is mapped by this page table. (Zero is
          not a valid virtual address.)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       page_dir_config_t pd_config,
			       uint64_t addr_to_find) {
	uint64_t res, i;
	// Make sure that the query is page-aligned
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, (uintptr_t)pd_config.page_dir << 12);
	// Search the top-level page directory (PDE3)
	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
		if ((res = search_page_directory_subtree(g, ((uintptr_t)pd_config.page_dir << 12) + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, 0)))
			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
	return 0;
}
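
/* Illustrative (uncompiled) sketch of reverse-translating a physical address.
 * `pd_config` is assumed to have been obtained elsewhere (only its `page_dir`
 * and `target` fields are consumed here), and `example_reverse_lookup` is a
 * hypothetical helper for illustration only, not part of this driver:

static void example_reverse_lookup(struct nvdebug_state *g,
				   page_dir_config_t pd_config,
				   uint64_t phys_addr) {
	// phys_addr must be page-aligned; a return of 0 means unmapped or error
	uint64_t virt_addr = search_page_directory(g, pd_config, phys_addr);
	if (!virt_addr)
		printk(KERN_INFO "[nvdebug] %#llx does not appear in this page table\n", phys_addr);
	else
		printk(KERN_INFO "[nvdebug] %#llx is mapped at GPU virtual address %#llx\n", phys_addr, virt_addr);
}
*/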

/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
  (See `search_page_directory()` for documentation.)
 */
uint64_t search_v1_page_directory(struct nvdebug_state *g,
				  page_dir_config_t pd_config,
				  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	uintptr_t pte_offset, pde_offset;
	void __iomem *pte_addr, *pde_addr;
	// For each PDE
	do {
		// Index the list of page directory entries
		pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
		// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
		pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
		if (IS_ERR_OR_NULL(pde_addr)) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr));
			return 0;
		}
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_addr + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_addr);
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
		printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Index the list of page table entries starting at pde.alt_addr
			pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
			// Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
			pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
			if (IS_ERR_OR_NULL(pte_addr)) {
				printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr));
				return 0;
			}
			// Read page table entry, avoiding readq
			pte.raw = readl(pte_addr + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_addr);
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}

/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
				  uint32_t addr_to_find) {
	int j, i = 0;
	page_dir_entry_v0_t pde;
	page_tbl_entry_v0_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
		//if (pde.raw)
		//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
		// Skip unpopulated PDEs
		if (pde.type == NOT_PRESENT)
			continue;
		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
		// For each PTE
		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			//if (pte.addr != 0x5555555)
			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
			if (pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V0_LSB | j << 12;
		}
	} while (++i < NV_MMU_PT_V0_SZ);
	return 0;  // No match
}
*/