1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
|
// Helpers to deal with NVIDIA's MMU and associated page tables
#include <linux/kernel.h> // Kernel types
#include "nvdebug.h"
/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
a configurable 1MB window into VRAM which is mapped into BAR0 (register)
space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
and appears to be used today to bootstrap page table configuration.
Why is it mapped at a location called NVIDIA Private RAM Instance? Because
this used to point to the entirety of instance RAM, which was separate from
VRAM on older NVIDIA GPUs.
*/
/* Convert a physical VRAM address to an offset in the PRAMIN window
   @param g    Device state (used to read the current BAR0 window register)
   @param addr VRAM address to convert
   @return -errno on error, PRAMIN offset on success
   Note: Use off2PRAMIN() instead if you want a dereferenceable address
   Note: PRAMIN window is only 1MB, so returning an int is safe
*/
static int vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
	uint64_t pramin_base_va;
	bar0_window_t window;
	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
	// Check if the address is valid (49 bits are addressable on-GPU)
	if (addr & ~0x0001ffffffffffff) {
		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
		       addr, __func__);
		return -EINVAL;
	}
	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
	if (window.target != TARGET_VID_MEM)
		return -EFAULT;
	pramin_base_va = ((uint64_t)window.base) << 16;
	// Protect against out-of-bounds accesses. Valid offsets are
	// [0, NV_PRAMIN_LEN); an address of exactly base + NV_PRAMIN_LEN maps
	// one byte past the window, so it must be rejected too (>=, not >).
	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
		return -ERANGE;
	return addr - pramin_base_va;
}
// Convert a GPU physical address to CPU virtual address via the PRAMIN window
// @return A dereferencable address, or 0 (an invalid physical address) on err
void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
	int off = vram2PRAMIN(g, phy);
	if (off == -ERANGE)
		printk(KERN_ERR "[nvdebug] Page table walk off end of PRAMIN!\n");
	if (off < 0)
		return 0;
	// Reuse the already-computed offset. (Previously vram2PRAMIN() was
	// called a second time here, issuing a redundant MMIO read of the
	// BAR0 window that could also race with a window reconfiguration.)
	return g->regs + NV_PRAMIN + off;
}
/* FIXME
void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
return g->bar2 + off;
}
*/
// Internal helper for search_page_directory().
// Recursively inspects one V2 page-directory entry at pde_offset, descending
// depth-first until a PTE mapping addr_to_find is found.
// @return 0 if not found; otherwise the virtual-address bits contributed by
//         this level and everything below it (the PTE match itself
//         contributes a 1 in the low bits, masked off by the caller).
uint64_t search_page_directory_subtree(struct nvdebug_state *g,
				       void __iomem *pde_offset,
				       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				       uint64_t addr_to_find,
				       uint32_t level) {
	uint64_t res, i;
	void __iomem *next;
	page_dir_entry_t entry;
	// Number of page-table levels == element count of the level tables.
	// (The original compared `level` against sizeof() in *bytes*, which
	// made this recursion-depth guard far too permissive.)
	const uint32_t num_levels = sizeof(NV_MMU_PT_V2_SZ) / sizeof(NV_MMU_PT_V2_SZ[0]);
	if (level >= num_levels)
		return 0;
	// Hack to workaround PDE0 being double-size and strangely formatted
	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
		pde_offset += 8;
	entry.raw_w = readq(pde_offset);
	// If we reached an invalid (unpopulated) PDE, walk back up the tree
	if (entry.target == PD_AND_TARGET_INVALID)
		return 0;
	// Succeed when we reach a PTE with the address we want
	if (entry.is_pte) {
		// TODO: Handle huge pages here
		printk(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
		return (uint64_t)entry.addr << 12 == addr_to_find;
	}
	// A non-PTE entry at the deepest level is malformed; bail out rather
	// than indexing the level tables out of bounds in the loop below.
	if (level + 1 >= num_levels)
		return 0;
	printk(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
	// Depth-first search of the page table
	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
		// off2addr can fail
		if (!next || !entry.addr_w) {
			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
			return 0;
		}
		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
		if (res)
			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
	}
	return 0;
}
/* GPU Physical address -> Virtual address ("reverse" translation)
   Performs a depth-first search over a V2 GPU page directory, looking for a
   PTE that maps the requested physical address, and rebuilds the virtual
   address from the indices taken along the successful path.
   @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
   @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
   @param addr_to_find Physical address to reconstruct the virtual address of
   @return 0 on error, otherwise the virtual address at which addr_to_find is
           mapped into by this page table. (Zero is not a valid virtual address)
*/
uint64_t search_page_directory(struct nvdebug_state *g,
			       void __iomem *pde_offset,
			       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
			       uint64_t addr_to_find) {
	uint64_t subtree_va, pde3_idx = 0;
	// Only page-aligned queries make sense; reject the rest up front
	if (addr_to_find & 0xfff) {
		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
		return 0;
	}
	printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018llx\n", (u64)addr_to_find, (u64)pde_offset);
	// Walk every top-level (PDE3) entry, recursing into each populated one
	while (pde3_idx < NV_MMU_PT_V2_SZ[0]) {
		void __iomem *entry = pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * pde3_idx;
		subtree_va = search_page_directory_subtree(g, entry, off2addr, addr_to_find, 0);
		if (subtree_va)
			// Drop the low "found" marker bits and merge in the
			// PDE3 index's contribution to the virtual address
			return (subtree_va & ~0xfff) | (pde3_idx << NV_MMU_PT_V2_LSB[0]);
		pde3_idx++;
	}
	return 0;
}
/* GMMU Page Tables Version 1
This page table only contains 2 levels and is used in the Fermi, Kepler, and
Maxwell architectures
*/
// Number of entries in the PDE and PTE levels
// NOTE(review): the code uses 1<<13 (8192) but the original comment claimed
// "2<<13 is an educated guess" — the two disagree; confirm the real count.
static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // educated guess!!!
// Which bit index is the least significant in indexing each page level
static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
/* Search a two-level V1 (Fermi/Kepler/Maxwell) page table for addr_to_find.
   @param pde_offset   Dereferenceable pointer to the start of the PDE array
   @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
   @param addr_to_find Physical address to reconstruct the virtual address of
   @return 0 on error or no match, otherwise the virtual address at which
           addr_to_find is mapped. (Zero is not a valid virtual address)
*/
uint64_t search_v1_page_directory(struct nvdebug_state *g,
				  void __iomem *pde_offset,
				  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
				  uint64_t addr_to_find) {
	uint64_t j, i = 0;
	page_dir_entry_v1_t pde;
	page_tbl_entry_v1_t pte;
	void __iomem *pte_offset;
	// For each PDE
	do {
		// readq doesn't seem to work on BAR0; assemble 64 bits from
		// two 32-bit reads (high word first)
		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
		pde.raw <<= 32;
		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
		// Verify PDE is present
		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
			continue;
		// Convert to a dereferencable pointer from CPU virtual address space
		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
		if (!pte_offset)
			continue;
		// For each PTE
		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
			// Don't overrun the PRAMIN window. The original check
			// compared only the loop-invariant PTE-array base, so
			// reads late in the array could still run past the
			// window; include the offset of the entry actually
			// being read.
			if (pte_offset + (j + 1) * sizeof(page_tbl_entry_v1_t) > NV_PRAMIN + g->regs + NV_PRAMIN_LEN)
				return 0;
			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
			pte.raw <<= 32;
			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
			// Skip non-present PTEs
			if (!pte.is_present)
				continue;
			// If we find a matching PTE, return its virtual address
			if ((uint64_t)pte.addr << 12 == addr_to_find)
				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
		}
	} while (++i < NV_MMU_PT_V1_SZ[0]);
	return 0;
}
/* GMMU Page Tables Version 0
This page table only contains 2 levels and is used in the Tesla architecture
*/
/* *** UNTESTED ***
#define NV_MMU_PT_V0_SZ 2048
#define NV_MMU_PT_V0_LSB 29
uint64_t search_v0_page_directory(struct nvdebug_state *g,
void __iomem *pde_offset,
void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
uint32_t addr_to_find) {
int j, i = 0;
page_dir_entry_v0_t pde;
page_tbl_entry_v0_t pte;
void __iomem *pte_offset;
// For each PDE
do {
// readq doesn't seem to work on BAR0
pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
pde.raw <<= 32;
pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
//if (pde.raw)
//printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
// Skip unpopulated PDEs
if (pde.type == NOT_PRESENT)
continue;
//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
// For each PTE
for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
pte.raw <<= 32;
pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
// Skip non-present PTEs
if (!pte.is_present)
continue;
// If we find a matching PTE, return its virtual address
//if (pte.addr != 0x5555555)
// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
if (pte.addr << 12 == addr_to_find)
return i << NV_MMU_PT_V0_LSB | j << 12;
}
} while (++i < NV_MMU_PT_V0_SZ);
return 0; // No match
}
*/
|