Diffstat (limited to 'mmu.c')
-rw-r--r-- | mmu.c | 206 |
1 file changed, 110 insertions, 96 deletions
@@ -1,117 +1,129 @@
1 | // Helpers to deal with NVIDIA's MMU and associated page tables | 1 | /* Copyright 2024 Joshua Bakita |
2 | * Helpers to deal with NVIDIA's MMU and associated page tables | ||
3 | */ | ||
4 | #include <linux/err.h> // ERR_PTR() etc. | ||
5 | #include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() | ||
2 | #include <linux/kernel.h> // Kernel types | 6 | #include <linux/kernel.h> // Kernel types |
3 | 7 | ||
4 | #include "nvdebug.h" | 8 | #include "nvdebug.h" |
5 | 9 | ||
6 | /* One of the oldest ways to access video memory on NVIDIA GPUs is by using | 10 | // Uncomment to print every PDE and PTE walked for debugging |
7 | a configurable 1MB window into VRAM which is mapped into BAR0 (register) | 11 | //#define DEBUG |
8 | space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs | 12 | #ifdef DEBUG |
9 | and appears to be used today to bootstrap page table configuration. | 12 | #ifdef DEBUG |
14 | #else | ||
15 | #define printk_debug(...) | ||
16 | #endif | ||
10 | 17 | ||
11 | Why is it mapped at a location called NVIDIA Private RAM Instance? Because | 18 | /* Convert a page directory (PD) pointer and aperture to be kernel-accessible |
12 | this used to point to the entirety of instance RAM, which was separate from | ||
13 | VRAM on older NVIDIA GPUs. | ||
14 | */ | ||
15 | 19 | ||
16 | /* Convert a physical VRAM address to an offset in the PRAMIN window | 20 | I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the |
17 | @param addr VRAM address to convert | 21 | AMDGPU driver. |
18 | @return -errno on error, PRAMIN offset on success | ||
19 | 22 | ||
20 | Note: Use off2PRAMIN() instead if you want a dereferenceable address | 23 | @param addr Pointer from page directory entry (PDE) |
21 | Note: PRAMIN window is only 1MB, so returning an int is safe | 24 | @param pd_ap PD-type aperture (target address space) for `addr` |
22 | */ | 25 | @return A dereferencable kernel address, or an ERR_PTR-wrapped error |
23 | static int vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) { | 26 | */ |
24 | uint64_t pramin_base_va; | 27 | void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) { |
25 | bar0_window_t window; | 28 | struct iommu_domain *dom; |
26 | window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW); | 29 | phys_addr_t phys; |
27 | // Check if the address is valid (49 bits are addressable on-GPU) | 30 | |
28 | if (addr & ~0x0001ffffffffffff) { | 31 | // Validate arguments |
29 | printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n", | 32 | if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr)) |
30 | addr, __func__); | 33 | return ERR_PTR(-EINVAL); |
31 | return -EINVAL; | 34 | |
35 | // VID_MEM accesses are the simple common-case | ||
36 | if (pd_ap == PD_AND_TARGET_VID_MEM) { | ||
37 | // Using BAR2 requires a page-table traversal. As this function is part | ||
38 | // of the page-table traversal process, it must instead use PRAMIN. | ||
39 | int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM); | ||
40 | if (off < 0) | ||
41 | return ERR_PTR(off); | ||
42 | return g->regs + NV_PRAMIN + off; | ||
32 | } | 43 | } |
33 | // For unclear (debugging?) reasons, PRAMIN can point to SYSMEM | 44 | /* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this |
34 | if (window.target != TARGET_VID_MEM) | 45 | * driver are known to create page directory entries in SYS_MEM. |
35 | return -EFAULT; | 46 | * |
36 | pramin_base_va = ((uint64_t)window.base) << 16; | 47 | * On systems using an I/O MMU, or some other I/O virtual address space, |
37 | // Protect against out-of-bounds accesses | 48 | * these are **not** physical addresses, and must first be translated |
38 | if (addr < pramin_base_va || addr > pramin_base_va + NV_PRAMIN_LEN) | 49 | * through the I/O MMU before use. |
39 | return -ERANGE; | 50 | * Example default meaning of a SYS_MEM address for a few CPUs: |
40 | return addr - pramin_base_va; | 51 | * - Jetson Xavier : physical address |
41 | } | 52 | * - AMD 3950X : I/O MMU address |
53 | * - Phenom II x4 : physical address | ||
54 | */ | ||
55 | // Check for, and translate through, the I/O MMU (if any) | ||
56 | if ((dom = iommu_get_domain_for_dev(g->dev))) { | ||
57 | phys = iommu_iova_to_phys(dom, addr); | ||
58 | printk(KERN_ERR "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys); | ||
59 | } else | ||
60 | phys = addr; | ||
42 | 61 | ||
43 | // Convert a GPU physical address to CPU virtual address via the PRAMIN window | 62 | if (!phys) |
44 | // @return A dereferencable address, or 0 (an invalid physical address) on err | ||
45 | void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) { | ||
46 | int off = vram2PRAMIN(g, phy); | ||
47 | if (off == -ERANGE) | ||
48 | printk(KERN_ERR "[nvdebug] Page table walk off end of PRAMIN!\n"); | ||
49 | if (off < 0) | ||
50 | return 0; | 63 | return 0; |
51 | return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy); | ||
52 | } | ||
53 | 64 | ||
54 | /* FIXME | 65 | return phys_to_virt(addr); |
55 | void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) { | ||
56 | return g->bar2 + off; | ||
57 | } | 66 | } |
58 | */ | ||
59 | 67 | ||
60 | // Internal helper for search_page_directory(). | 68 | // Internal helper for search_page_directory(). |
61 | uint64_t search_page_directory_subtree(struct nvdebug_state *g, | 69 | uint64_t search_page_directory_subtree(struct nvdebug_state *g, |
62 | void __iomem *pde_offset, | 70 | uintptr_t pde_addr, |
63 | void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t), | 71 | enum PD_TARGET pde_target, |
64 | uint64_t addr_to_find, | 72 | uint64_t addr_to_find, |
65 | uint32_t level) { | 73 | uint32_t level) { |
66 | uint64_t res, i; | 74 | uint64_t res, i; |
67 | void __iomem *next; | 75 | void __iomem *pde_kern; |
68 | page_dir_entry_t entry; | 76 | page_dir_entry_t entry; |
69 | if (level > sizeof(NV_MMU_PT_V2_SZ)) | 77 | if (level > sizeof(NV_MMU_PT_V2_SZ)) |
70 | return 0; | 78 | return 0; |
71 | // Hack to work around PDE0 being double-size and strangely formatted | 79 | // Hack to work around PDE0 being double-size and strangely formatted |
72 | if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16) | 80 | if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16) |
73 | pde_offset += 8; | 81 | pde_addr += 8; |
74 | entry.raw_w = readq(pde_offset); | 82 | // Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible |
83 | pde_kern = pd_deref(g, pde_addr, pde_target); | ||
84 | if (IS_ERR_OR_NULL(pde_kern)) { | ||
85 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern)); | ||
86 | return 0; | ||
87 | } | ||
88 | // Read the page directory entry (a pointer to another directory, or a PTE) | ||
89 | entry.raw_w = readq(pde_kern); | ||
75 | // If we reached an invalid (unpopulated) PDE, walk back up the tree | 90 | // If we reached an invalid (unpopulated) PDE, walk back up the tree |
76 | if (entry.target == PD_AND_TARGET_INVALID) | 91 | if (entry.target == PD_AND_TARGET_INVALID) |
77 | return 0; | 92 | return 0; |
78 | // Succeed when we reach a PTE with the address we want | 93 | // Succeed when we reach a PTE with the address we want |
79 | if (entry.is_pte) { | 94 | if (entry.is_pte) { |
80 | // TODO: Handle huge pages here | 95 | // TODO: Handle huge pages here |
81 | printk(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w); | 96 | printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w); |
82 | return (uint64_t)entry.addr << 12 == addr_to_find; | 97 | return (uint64_t)entry.addr << 12 == addr_to_find; |
83 | } | 98 | } |
84 | printk(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w); | 99 | printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w); |
85 | // Depth-first search of the page table | 100 | // Depth-first search of the page table |
86 | for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) { | 101 | for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) { |
87 | next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i); | 102 | uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i; |
88 | // off2addr can fail | 103 | res = search_page_directory_subtree(g, next, entry.target, addr_to_find, level + 1); |
89 | if (!next || !entry.addr_w) { | ||
90 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__); | ||
91 | return 0; | ||
92 | } | ||
93 | res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1); | ||
94 | if (res) | 104 | if (res) |
95 | return res | (i << NV_MMU_PT_V2_LSB[level + 1]); | 105 | return res | (i << NV_MMU_PT_V2_LSB[level + 1]); |
96 | } | 106 | } |
97 | return 0; | 107 | return 0; |
98 | } | 108 | } |
99 | 109 | ||
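The `res | (i << NV_MMU_PT_V2_LSB[...])` steps above rebuild the virtual address from the index chosen at each level of the walk. For reference, the forward decomposition is sketched below (illustrative only; it assumes each NV_MMU_PT_V2_SZ entry is a power of two and that the table is an array visible to this file).

// Sketch: split a GPU virtual address into its per-level V2 page table indices.
// idx[0] indexes the top-level directory (PDE3); the last entry indexes a 4 KB PTE.
static void va_to_v2_indices(uint64_t va, uint64_t idx[])
{
	size_t level;
	for (level = 0; level < ARRAY_SIZE(NV_MMU_PT_V2_SZ); level++)
		idx[level] = (va >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
}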
100 | /* GPU Physical address -> Virtual address ("reverse" translation) | 110 | /* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables |
111 | |||
112 | Depth-first search a page directory of the GPU MMU for where a particular | ||
113 | physical address is mapped. Upon finding a mapping, the virtual address is | ||
114 | returned. | ||
101 | 115 | ||
102 | Depth-first search a page directory of the GPU MMU for where a particular | 116 | The page directory may be located in VID_MEM, SYS_MEM, or some combination of |
103 | physical address is mapped. Upon finding a mapping, the virtual address is | 117 | the two. |
104 | returned. | ||
105 | 118 | ||
106 | @param pde_offset Dereferenceable pointer to the start of the PDE3 entries | 119 | @param pd_config Page Directory configuration, containing pointer and |
107 | @param off2addr Func to convert VRAM phys addresses to valid CPU VAs | 120 | aperture for the start of the PDE3 entries |
108 | @param addr_to_find Physical address to reconstruct the virtual address of | 121 | @param addr_to_find Physical address to reconstruct the virtual address of |
109 | @return 0 on error, otherwise the virtual address at which addr_to_find is | 122 | @return 0 on error, otherwise the virtual address at which addr_to_find is |
110 | mapped into by this page table. (Zero is not a valid virtual address) | 123 | mapped into by this page table. (Zero is not a valid virtual address) |
111 | */ | 124 | */ |
112 | uint64_t search_page_directory(struct nvdebug_state *g, | 125 | uint64_t search_page_directory(struct nvdebug_state *g, |
113 | void __iomem *pde_offset, | 126 | page_dir_config_t pd_config, |
114 | void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t), | ||
115 | uint64_t addr_to_find) { | 127 | uint64_t addr_to_find) { |
116 | uint64_t res, i; | 128 | uint64_t res, i; |
117 | // Make sure that the query is page-aligned | 129 | // Make sure that the query is page-aligned |
@@ -119,57 +131,62 @@ uint64_t search_page_directory(struct nvdebug_state *g,
119 | printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find); | 131 | printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find); |
120 | return 0; | 132 | return 0; |
121 | } | 133 | } |
122 | printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018llx\n", (u64)addr_to_find, (u64)pde_offset); | 134 | printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, (uintptr_t)pd_config.page_dir << 12); |
123 | // Search the top-level page directory (PDE3) | 135 | // Search the top-level page directory (PDE3) |
124 | for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++) | 136 | for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++) |
125 | if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0))) | 137 | if ((res = search_page_directory_subtree(g, ((uintptr_t)pd_config.page_dir << 12) + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, 0))) |
126 | return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]); | 138 | return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]); |
127 | return 0; | 139 | return 0; |
128 | } | 140 | } |
129 | 141 | ||
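A hedged usage sketch of the new interface follows (not from the commit). get_live_pd_config() is a hypothetical stand-in for however the caller obtains the page directory configuration, which in nvdebug normally comes from reading an instance block.

// Sketch: find where a physical address is mapped in a context's virtual address space.
static void example_reverse_lookup(struct nvdebug_state *g, uint64_t phys)
{
	page_dir_config_t pd_config;
	uint64_t virt;
	// Hypothetical helper; obtains the page directory pointer and aperture
	if (get_live_pd_config(g, &pd_config) < 0)
		return;
	virt = search_page_directory(g, pd_config, phys);
	if (virt)
		printk(KERN_INFO "[nvdebug] %#llx is mapped at GPU VA %#llx\n", phys, virt);
	else
		printk(KERN_INFO "[nvdebug] %#llx is not mapped in this address space\n", phys);
}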
130 | /* GMMU Page Tables Version 1 | 142 | /* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables |
131 | This page table only contains 2 levels and is used in the Fermi, Kepler, and | 143 | (See `search_page_directory()` for documentation.) |
132 | Maxwell architectures | 144 | */ |
133 | */ | ||
134 | // Number of entries in the PDE and PTE levels | ||
135 | static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 2<<13 is an educated guess!!! | ||
136 | // Which bit index is the least significant in indexing each page level | ||
137 | static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!! | ||
138 | uint64_t search_v1_page_directory(struct nvdebug_state *g, | 145 | uint64_t search_v1_page_directory(struct nvdebug_state *g, |
139 | void __iomem *pde_offset, | 146 | page_dir_config_t pd_config, |
140 | void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t), | ||
141 | uint64_t addr_to_find) { | 147 | uint64_t addr_to_find) { |
142 | uint64_t j, i = 0; | 148 | uint64_t j, i = 0; |
143 | page_dir_entry_v1_t pde; | 149 | page_dir_entry_v1_t pde; |
144 | page_tbl_entry_v1_t pte; | 150 | page_tbl_entry_v1_t pte; |
145 | void __iomem *pte_offset; | 151 | uintptr_t pte_offset, pde_offset; |
152 | void __iomem *pte_addr, *pde_addr; | ||
146 | // For each PDE | 153 | // For each PDE |
147 | do { | 154 | do { |
155 | // Index the list of page directory entries | ||
156 | pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t); | ||
157 | // Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr | ||
158 | pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target)); | ||
159 | if (IS_ERR_OR_NULL(pde_addr)) { | ||
160 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr)); | ||
161 | return 0; | ||
162 | } | ||
148 | // readq doesn't seem to work on BAR0 | 163 | // readq doesn't seem to work on BAR0 |
149 | pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4); | 164 | pde.raw = readl(pde_addr + 4); |
150 | pde.raw <<= 32; | 165 | pde.raw <<= 32; |
151 | pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t)); | 166 | pde.raw |= readl(pde_addr); |
152 | // Verify PDE is present | 167 | // Verify PDE is present |
153 | if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) | 168 | if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) |
154 | continue; | 169 | continue; |
155 | // Convert to a dereferencable pointer from CPU virtual address space | ||
156 | pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12); | ||
157 | if (!pte_offset) | ||
158 | continue; | ||
159 | // printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); | 170 | // printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); |
160 | // printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw); | 171 | printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); |
161 | // For each PTE | 172 | // For each PTE |
162 | for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { | 173 | for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { |
163 | // Don't overrun the PRAMIN window | 174 | // Index the list of page table entries starting at pde.alt_addr |
164 | if (pte_offset > NV_PRAMIN + g->regs + NV_PRAMIN_LEN) | 175 | pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t); |
176 | // Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr | ||
177 | pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target)); | ||
178 | if (IS_ERR_OR_NULL(pte_addr)) { | ||
179 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr)); | ||
165 | return 0; | 180 | return 0; |
166 | pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4); | 181 | } |
182 | // Read page table entry, avoiding readq | ||
183 | pte.raw = readl(pte_addr + 4); | ||
167 | pte.raw <<= 32; | 184 | pte.raw <<= 32; |
168 | pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t)); | 185 | pte.raw |= readl(pte_addr); |
169 | // Skip non-present PTEs | 186 | // Skip non-present PTEs |
170 | if (!pte.is_present) | 187 | if (!pte.is_present) |
171 | continue; | 188 | continue; |
172 | // printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw); | 189 | printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw); |
173 | // If we find a matching PTE, return its virtual address | 190 | // If we find a matching PTE, return its virtual address |
174 | if ((uint64_t)pte.addr << 12 == addr_to_find) | 191 | if ((uint64_t)pte.addr << 12 == addr_to_find) |
175 | return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1]; | 192 | return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1]; |
@@ -178,9 +195,6 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
178 | return 0; | 195 | return 0; |
179 | } | 196 | } |
180 | 197 | ||
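For completeness, the V1 (Fermi/Kepler/Maxwell) result assembled above is just two packed indices. The forward split is sketched below (illustrative only; it assumes the NV_MMU_PT_V1_LSB table removed from this file remains visible via the header, since the new code still references it).

// Sketch: split a V1 GPU virtual address into its PDE index and small-page PTE index.
static void va_to_v1_indices(uint64_t va, uint64_t *pde_idx, uint64_t *pte_idx)
{
	*pde_idx = va >> NV_MMU_PT_V1_LSB[0];
	*pte_idx = (va >> NV_MMU_PT_V1_LSB[1]) &
	           ((1ULL << (NV_MMU_PT_V1_LSB[0] - NV_MMU_PT_V1_LSB[1])) - 1);
}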
181 | /* GMMU Page Tables Version 0 | ||
182 | This page table only contains 2 levels and is used in the Tesla architecture | ||
183 | */ | ||
184 | /* *** UNTESTED *** | 198 | /* *** UNTESTED *** |
185 | #define NV_MMU_PT_V0_SZ 2048 | 199 | #define NV_MMU_PT_V0_SZ 2048 |
186 | #define NV_MMU_PT_V0_LSB 29 | 200 | #define NV_MMU_PT_V0_LSB 29 |