author    Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:01 -0400
committer Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:13 -0400
commit    293430fcb5d4013b573556c58457ee706e482b7f (patch)
tree      9328fa680f55b4e1a08d24714275b8437be3be5d
parent    494df296bf4abe9b2b484bde1a4fad28c989afec (diff)
Snapshot for ECRTS'25 artifact evaluation
-rw-r--r--  Makefile               3
-rw-r--r--  README.md             10
-rw-r--r--  device_info_procfs.c  79
-rw-r--r--  mmu.c                414
-rw-r--r--  nvdebug.h            293
-rw-r--r--  nvdebug_entry.c      476
-rw-r--r--  nvdebug_linux.h        5
-rw-r--r--  runlist.c            275
-rw-r--r--  runlist_procfs.c     645
9 files changed, 2154 insertions, 46 deletions
diff --git a/Makefile b/Makefile
index fea3819..9d6d374 100644
--- a/Makefile
+++ b/Makefile
@@ -8,3 +8,6 @@ all:
8 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules 8 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
9clean: 9clean:
10 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean 10 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
11
12nvdebug_user.so: runlist.c mmu.c bus.c nvdebug_user.c
13	gcc $< -shared -o $@ $(KBUILD_CFLAGS)
diff --git a/README.md b/README.md
index da3e5d7..2889b29 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ Not all these TPCs will necessarially be enabled in every GPC.
59Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X. 59Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X.
60A set bit indicates a disabled TPC. 60A set bit indicates a disabled TPC.
61This API is only available on enabled GPCs. 61This API is only available on enabled GPCs.
62Bits greater than the number of on-chip TPCs per GPC should be ignored (it may appear that non-existent TPCs are "disabled").
62 63
63Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC). 64Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC).
64 65
@@ -83,6 +84,13 @@ Use `echo Z > runlistY/switch_to_tsg` to switch the GPU to run only the specifie
83 84
84Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels). 85Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels).
85 86
87## Error Interpretation
88First check the kernel log to see if it includes more information about the error.
89The following conventions are used for certain error codes:
90
91- EIO, "Input/Output Error," is returned when an operation fails due to a bad register read.
92- (Other errors may not have a consistent conventional meaning; see the implementation.)
93
86## General Codebase Structure 94## General Codebase Structure
87- `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers. 95- `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers.
88- `nvdebug_entry.h` contains module startup, device detection, initialization, and module teardown logic. 96- `nvdebug_entry.h` contains module startup, device detection, initialization, and module teardown logic.
@@ -94,4 +102,4 @@ Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GP
94 102
95- The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs). 103- The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs).
96 To workaround, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting. 104 To workaround, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting.
97 (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.) 105 (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_drm nvidia_modeset nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.)
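The SM-count recipe in the README hunk above can also be scripted. A minimal userspace sketch, assuming the module's procfs files are exposed under /proc/gpu0/ (adjust the path for your setup); this helper is illustrative only and not part of the module:

#include <stdio.h>

// Read a single integer (decimal or 0x-prefixed hex) from a procfs file
static int read_proc_int(const char *path) {
	FILE *f = fopen(path, "r");
	int val = -1;
	if (!f)
		return -1;
	if (fscanf(f, "%i", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void) {
	int gpcs = read_proc_int("/proc/gpu0/num_gpcs");
	int tpc_per_gpc = read_proc_int("/proc/gpu0/num_tpc_per_gpc");
	if (gpcs < 0 || tpc_per_gpc < 0)
		return 1;
	// Volta+ GPUs have 2 SMs per TPC (see the README text above)
	printf("On-chip SMs: %d\n", gpcs * tpc_per_gpc * 2);
	return 0;
}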
diff --git a/device_info_procfs.c b/device_info_procfs.c
index 4e4ab03..105e731 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -18,7 +18,7 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size,
18 return 0; 18 return 0;
19 19
20 if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1) 20 if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1)
21 return -EOPNOTSUPP; 21 return -EIO;
22 // 32 bit register will always take less than 16 characters to print 22 // 32 bit register will always take less than 16 characters to print
23 chars_written = scnprintf(out, 16, "%#0x\n", read); 23 chars_written = scnprintf(out, 16, "%#0x\n", read);
24 if (copy_to_user(buf, out, chars_written)) 24 if (copy_to_user(buf, out, chars_written))
@@ -32,12 +32,85 @@ struct file_operations nvdebug_read_reg32_file_ops = {
32 .llseek = default_llseek, 32 .llseek = default_llseek,
33}; 33};
34 34
35typedef union {
36 struct {
37 uint8_t partitioning_select:2;
38 uint8_t table_select:2;
39 uint32_t pad_1:12;
40 uint8_t veid_offset:6;
41 uint32_t pad_2:2;
42 uint8_t table_offset:6;
43 uint32_t pad_3:2;
44 };
45 uint32_t raw;
46} partition_ctl_t;
47
48static ssize_t nvdebug_read_part(struct file *f, char __user *buf, size_t size, loff_t *off) {
49 char out[12*64+2];
50 int i, chars_written = 0;
51 partition_ctl_t part_ctl;
52 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
 53	if (size < sizeof(out) || *off != 0)
 54		return 0;
 55	// Each of the 64 words printed below takes at most 11 characters
56 part_ctl.raw = nvdebug_readl(g, 0x00405b2c);
57 //part_ctl.partitioning_select = 0; // XXX XXX XXX Temp; 06/18/2024
58 //part_ctl.table_select = 3; // 3 == ???
59 //part_ctl.table_select = 2; // 2 == TBL_SEL_PARTITIONING_LMEM_BLK
60 part_ctl.table_select = 1; // 1 == TBL_SEL_PARTITIONING_ENABLE
61 //part_ctl.table_select = 0; // 0 == TBL_SEL_NONE
62 part_ctl.veid_offset = (uintptr_t)pde_data(file_inode(f)); // Range of [0, 0x3f], aka [0, 63]
63 for (i = 0; i < 64; i++) {
64 // Increment to next table offset in PARTITION_CTL
65 part_ctl.table_offset = i;
66 nvdebug_writel(g, 0x00405b2c, part_ctl.raw);
67 // Verify write applied to PARTITION_CTL
68 part_ctl.raw = nvdebug_readl(g, 0x00405b2c);
69 if (part_ctl.table_offset != i)
70 return -ENOTRECOVERABLE;
71 // Read PARTITION_DATA and print
72 // ---
73 // I get back 0x000000ff on Volta and 0x00000003 on Turing from
74 // PARTITION_DATA for all possible VEID_OFFSET, TBL_OFFSET, and TBL_SEL
75 // combinations.
76 // ---
77 // There's a 48-byte (12-word) gap after the address for PARTITION_DATA.
78 // Exploring this on Turing for TBL_SEL_PARTITIONING_ENABLE, VEID 1, 62, and
79 // 63, with CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=5 for constant_cycles_kernel
80 // running under MPS:
81 // +0x0: 0x3
82 // +0x4: 0
83 // +0x8: 0x100
84 // +0xC: 0
85 // +0x10: 0xffffffff
86 // +0x14: 0
87 // +0x18: 0
88 // +0x1C: 0xffffffff
89 // +0x20: 0
90 // +0x24: 0xffffffff
91 // +0x28: 0xffffffff
92 // +0x2C: 0xffffffff
93 chars_written += scnprintf(out + chars_written, 12, "%#010x ", nvdebug_readl(g, 0x00405b30));
94 }
95 chars_written += scnprintf(out + chars_written, 2, "\n");
96 if (copy_to_user(buf, out, chars_written))
97 printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
98 *off += chars_written;
99 return chars_written;
100}
101
102struct file_operations nvdebug_read_part_file_ops = {
103 .read = nvdebug_read_part,
104 .llseek = default_llseek,
105};
106
35static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { 107static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
36 char out[12]; 108 char out[12];
37 int chars_written; 109 int chars_written;
38 uint32_t read, mask; 110 uint32_t read, mask;
39 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; 111 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
40 // See comment in nvdebug_entry.c to understand `union reg_range` 112 // `start_bit` is included, `stop_bit` is not, so to print lower eight bits
113 // from a register, use `start_bit = 0` and `stop_bit = 8`.
41 union reg_range range; 114 union reg_range range;
42 range.raw = (uintptr_t)pde_data(file_inode(f)); 115 range.raw = (uintptr_t)pde_data(file_inode(f));
43 116
@@ -47,7 +120,7 @@ static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t s
47 120
48 // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` 121 // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset`
49 if ((read = nvdebug_readl(g, range.offset)) == -1) 122 if ((read = nvdebug_readl(g, range.offset)) == -1)
50 return -EOPNOTSUPP; 123 return -EIO;
51 // Setup `mask` used to throw out unused upper bits 124 // Setup `mask` used to throw out unused upper bits
52 mask = -1u >> (32 - range.stop_bit + range.start_bit); 125 mask = -1u >> (32 - range.stop_bit + range.start_bit);
53 // Throw out unused lower bits via a shift, apply the mask, and print 126 // Throw out unused lower bits via a shift, apply the mask, and print
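To make the start_bit/stop_bit convention above concrete, the following standalone helper (not part of the module) mirrors the mask-and-shift logic: with start_bit = 0 and stop_bit = 8, the mask is -1u >> 24 = 0xff, i.e., the low byte of the register.

#include <stdint.h>

// Mirrors nvdebug_reg_range_read()'s extraction: start_bit is included,
// stop_bit is excluded.
static uint32_t extract_bits(uint32_t reg, unsigned int start_bit, unsigned int stop_bit) {
	uint32_t mask = -1u >> (32 - stop_bit + start_bit);
	return (reg >> start_bit) & mask;
}
// extract_bits(0x12345678, 0, 8) == 0x78; extract_bits(0x12345678, 8, 16) == 0x56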
diff --git a/mmu.c b/mmu.c
index ababef5..e2b9a91 100644
--- a/mmu.c
+++ b/mmu.c
@@ -1,9 +1,13 @@
1/* Copyright 2024 Joshua Bakita 1/* Copyright 2024 Joshua Bakita
2 * Helpers to deal with NVIDIA's MMU and associated page tables 2 * Helpers to deal with NVIDIA's MMU and associated page tables
3 */ 3 */
4#include <linux/dma-mapping.h> // dma_map_page() and dma_unmap_page()
4#include <linux/err.h> // ERR_PTR() etc. 5#include <linux/err.h> // ERR_PTR() etc.
6#include <linux/gfp.h> // alloc_pages()
5#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() 7#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
6#include <linux/kernel.h> // Kernel types 8#include <linux/kernel.h> // Kernel types
9#include <linux/list.h> // struct list_head and associated functions
10#include <linux/mm.h> // put_page()
7 11
8#include "nvdebug.h" 12#include "nvdebug.h"
9 13
@@ -15,6 +19,11 @@ int g_verbose = 0;
15#define printk_debug if (g_verbose >= 2) printk 19#define printk_debug if (g_verbose >= 2) printk
16#define printk_info if (g_verbose >= 1) printk 20#define printk_info if (g_verbose >= 1) printk
17 21
22// At least map_page_directory() assumes that pages are 4 KiB
23#if PAGE_SIZE != 4096
24#error nvdebug assumes and requires a 4 KiB page size.
25#endif
26
18/* Convert a page directory (PD) pointer and aperture to be kernel-accessible 27/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
19 28
20 I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the 29 I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
@@ -22,7 +31,8 @@ int g_verbose = 0;
22 31
23 @param addr Pointer from page directory entry (PDE) 32 @param addr Pointer from page directory entry (PDE)
24 @param pd_ap PD-type aperture (target address space) for `addr` 33 @param pd_ap PD-type aperture (target address space) for `addr`
25 @return A dereferencable kernel address, or an ERR_PTR-wrapped error 34 @return A dereferencable kernel address, 0 if an I/O MMU is in use and has
35 no available mapping for the bus address, or an ERR_PTR-wrapped error
26 */ 36 */
27static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, 37static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr,
28 enum PD_TARGET pd_ap) { 38 enum PD_TARGET pd_ap) {
@@ -56,7 +66,7 @@ static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr,
56 // Check for, and translate through, the I/O MMU (if any) 66 // Check for, and translate through, the I/O MMU (if any)
57 if ((dom = iommu_get_domain_for_dev(g->dev))) { 67 if ((dom = iommu_get_domain_for_dev(g->dev))) {
58 phys = iommu_iova_to_phys(dom, addr); 68 phys = iommu_iova_to_phys(dom, addr);
59 printk_debug(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", addr, phys); 69 printk_debug(KERN_DEBUG "[nvdebug] %s: I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", __func__, addr, phys);
60 } else 70 } else
61 phys = addr; 71 phys = addr;
62 72
@@ -143,6 +153,327 @@ uint64_t search_page_directory(struct nvdebug_state *g,
143 return 0; 153 return 0;
144} 154}
145 155
156/* GPU Virtual address -> Physical address ("forward" translation) for V2 tables
157 Index the page directories and tables used by the GPU MMU to determine which
158 physical address a given GPU virtual address has been mapped to.
159
160 The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
161 across multiple apertures.
162
163 @param pd_config Page Directory configuration, containing pointer and
164 aperture for the start of the PDE3 entries
165 @param addr_to_find Virtual address to translate to a physical address
166 @param found_addr Where to store found physical address (0 if unfound)
167 @param found_aperture Where to store aperture of found physical address
168 @return 0 on success, -ENXIO if not found, and -errno on error.
169*/
170int translate_page_directory(struct nvdebug_state *g,
171 page_dir_config_t pd_config,
172 uint64_t addr_to_find,
173 uint64_t *found_addr /* out */,
174 enum INST_TARGET *found_aperture /* out */) {
175 page_dir_entry_t entry;
176 void __iomem *next_kva;
177 unsigned int level, pde_idx;
178 uintptr_t next = (uintptr_t)pd_config.page_dir << 12;
179 enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target);
180
181 *found_addr = 0;
182 *found_aperture = TARGET_INVALID;
183
184 // Make sure that the query is page-aligned (likely mistake otherwise)
185 if (addr_to_find & 0xfff) {
186 printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_page_directory()!\n", addr_to_find);
187 return -EINVAL;
188 }
189
190 printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V2 page table with base %#018llx\n", (u64)addr_to_find, (u64)next);
191
192 // Step through each PDE level and the PTE level
193 for (level = 0; level < 5; level++) {
194 // Index into this level
195 pde_idx = (addr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
196 printk_debug(KERN_DEBUG "[nvdebug] Using index %u in lvl %d\n", pde_idx, level);
197 // Hack to workaround PDE0 being double-size and strangely formatted
198 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
199 next += 8;
200 // Obtain a kernel-dereferencable address
201 next_kva = pd_deref(g, next, next_target);
202 if (IS_ERR_OR_NULL(next_kva)) {
203 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva));
204 return PTR_ERR(next_kva);
205 }
206 // Obtain entry at this level
207 entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
208 if (entry.target == PD_AND_TARGET_INVALID)
209 return -ENXIO;
210 printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %#018llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
211 // Just return the physical address if this is the PTE level
212 if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB
213 *found_addr = ((uint64_t)entry.addr) << 12;
214 *found_aperture = entry.aperture;
215 return 0;
216 }
217 // Otherwise step to the next table level
218 // TODO: Use addr_w as appropriate
219 next = (uint64_t)entry.addr << 12;
220 next_target = entry.target;
221 }
222
223 return 0;
224}
225
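A call-site sketch (not part of this commit) of how the forward translation above might be used together with the BAR2/BAR3 page table from get_bar2_pdb(); the helper name is hypothetical.

// Hypothetical helper: report where page-aligned GPU virtual address `vaddr`
// of the BAR2/BAR3 address space is currently mapped.
static int example_translate_bar2(struct nvdebug_state *g, uint64_t vaddr) {
	page_dir_config_t pd_config;
	uint64_t paddr;
	enum INST_TARGET aperture;
	int err;
	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	if (pd_config.is_ver2)
		err = translate_page_directory(g, pd_config, vaddr, &paddr, &aperture);
	else
		err = translate_v1_page_directory(g, pd_config, vaddr, &paddr, &aperture);
	if (err < 0)
		return err;
	printk(KERN_INFO "[nvdebug] %#018llx -> %#018llx (%s)\n", vaddr, paddr,
	       target_to_text(aperture));
	return 0;
}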
226// This struct is very special. We will never directly allocate this struct;
227// its sole purpose is to provide more intuitive names to the offsets at which
228// we store data in Linux's struct page. Such (ab)use of struct page is
 229// explicitly permitted (see linux/mm_types.h). This struct is thus used by
230// casting a pointer of struct page to a pointer of struct nvdebug_pd_page,
231// then accessing the associated fields. This pointer may also be freely cast
 232// back to a struct page pointer.
233// We have 24 (32-bit) or 44 (64-bit) bytes available in the page struct
234// (according to the documentation on struct page). Our comments indicate what
235// available parts of struct page we repurpose for our own needs.
236struct nvdebug_pd_page {
237 unsigned long __flags; // From struct page; do not touch!
238 // Overlaps struct page.lru
239 struct list_head list; // 4/8 bytes
240 // Overlaps struct page.mapping (and page.share on 32-bit)
241 uintptr_t parent_addr; // 8 bytes
242 // Overlaps struct page.share (page.private on 32-bit)
243 enum PD_TARGET parent_aperture; // 4 bytes
244 // Overlaps page.private (page.page_type on 32-bit)
245 dma_addr_t dma_addr; // 4/8 bytes
246};
247
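One guard that could be added (it is not in this commit) to catch this struct outgrowing the space it borrows; note it only checks total size, not the exact field overlap described above.

// Hypothetical compile-time check: struct nvdebug_pd_page must never be larger
// than struct page, whose storage it reinterprets.
static inline void nvdebug_pd_page_layout_check(void) {
	BUILD_BUG_ON(sizeof(struct nvdebug_pd_page) > sizeof(struct page));
}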
248/* Collect and free any now-unused page directory/table allocations
249
250 @param force Deallocate all page directories/tables created by this module,
251 no matter if they appear to be in-use or not.
252 @returns Number of freed pages on success, -errno on error.
253*/
254int gc_page_directory(struct nvdebug_state *g, bool force) {
255 struct nvdebug_pd_page *page, *_page;
256 void __iomem *parent_kva;
257 page_dir_entry_t parent_entry;
258 int freed_pages = 0;
259
260 // Depth-first traversal (from perspective of each page table) of page
261 // allocations.
262 // (This is depth-first because map_page_directory() always allocates and
263 // pushes page directory allocations before page table allocations.)
264 list_for_each_entry_safe_reverse(page, _page, &g->pd_allocs, list) {
265 printk_debug(KERN_DEBUG "[nvdebug] %s: Checking if page directory/table at %llx (SYS_MEM_?) with parent at %lx (%s) is unused...\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture));
266 // Try to determine if we're still in-use. We consider ourselves
267 // potentially in-use if our parent still points to us.
268 parent_kva = pd_deref(g, page->parent_addr, page->parent_aperture);
269 if (IS_ERR(parent_kva)) {
270 printk(KERN_ERR "[nvdebug] %s: Error resolving %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, page->parent_addr, pd_target_to_text(page->parent_aperture), PTR_ERR(parent_kva));
271 return -ENOTRECOVERABLE;
272 }
273 // A NULL kva indicates parent no longer exists
274 parent_entry.raw_w = parent_kva ? readq(parent_kva) : 0;
275 // Page directory/table still in-use; do not free unless forced
276 if (parent_entry.addr_w == (page->dma_addr >> 12) && !force)
277 continue;
278 // Free this page table/directory and delete our parent's pointer to us
279 if (parent_entry.addr_w == (page->dma_addr >> 12)) {
280 printk(KERN_WARNING "[nvdebug] %s: Deleting page table/directory at %llx (SYS_MEM_?) with parent at %lx (%s) that may still be in-use!\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture));
281 writeq(0, parent_kva);
282 }
283 // Unmap, zero, free, and remove from tracking (these all return void)
284 dma_unmap_page(g->dev, page->dma_addr, PAGE_SIZE, DMA_TO_DEVICE);
285 memset(page_to_virt((struct page*)page), 0, PAGE_SIZE);
286 // Necessary to reset mapcount as we (ab)use its state for other things
287 page_mapcount_reset((struct page*)page);
288 // Same reset needed for mapping
289 ((struct page*)page)->mapping = NULL;
290 // Remove this page from our list of allocated pages
291 list_del(&page->list);
292 // Free the page
293 put_page((struct page*)page);
294 freed_pages++;
295 }
296 printk_debug(KERN_DEBUG "[nvdebug] %s: Freed %d pages.", __func__, freed_pages);
297 return freed_pages;
298}
299
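A call-site sketch (not part of this commit) of how the collector above might be invoked from a teardown path:

// Hypothetical cleanup helper: force-free every page directory/table this
// module allocated, e.g., immediately before module unload.
static void example_cleanup_pd_allocs(struct nvdebug_state *g) {
	int freed = gc_page_directory(g, true);
	if (freed < 0)
		printk(KERN_ERR "[nvdebug] Unable to free page directory/table allocations: %d\n", freed);
	else
		printk(KERN_INFO "[nvdebug] Freed %d page directory/table pages\n", freed);
}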
300/* Map a GPU virtual address to a physical address in a GPU page table
301 Search for a mapping for specified GPU virtual address, and create a new one
302 if none is found. Automatically creates page directories and page table
303 entries as necessary.
304
305 The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
306 across multiple apertures.
307
308 @param pd_config Page Directory configuration, containing pointer and
309 aperture for the start of the PDE3 entries
310 @param vaddr_to_find Virtual address to check, and map to a physical address
311 if nothing is already mapped (up to 49 bits long)
312 @param paddr_to_map Physical address to use (up to 36 bits long if VID_MEM,
313 and up to 58 bits if SYS_MEM)
314 @param paddr_target Which space does the physical address refer to?
315 @param huge_page Set to map a 2 MiB, rather than 4 KiB, page
316 @return 0 on success, 1 if mapping already exists, -EADDRINUSE if virtual
317 address is already mapped to something else, and -errno on error
318*/
319int map_page_directory(struct nvdebug_state *g,
320 page_dir_config_t pd_config,
321 uint64_t vaddr_to_find,
322 uint64_t paddr_to_map,
323 enum INST_TARGET paddr_target,
324 bool huge_page) {
325 page_dir_entry_t entry;
326 void __iomem *next_kva;
327 unsigned int level, pde_idx;
328 uintptr_t next = (uintptr_t)pd_config.page_dir << 12;
329 enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target);
330
331 // Make sure that the query is page-aligned (likely mistake otherwise)
332 if ((vaddr_to_find & 0xfff || paddr_to_map & 0xfff)
333 || (huge_page && (vaddr_to_find & 0x1fffff || paddr_to_map & 0x1fffff))) {
334 printk(KERN_WARNING "[nvdebug] %s: Attempting to map an unaligned address (physical %#018llx or virtual %#018llx)! Failing...\n", __func__, paddr_to_map, vaddr_to_find);
335 return -EINVAL;
336 }
337
 338	// NVIDIA supports up to 49-bit virtual addresses
339 // Except Jetson Xavier only seems to be able to resolve 47-bit addresses?
340 if (vaddr_to_find >> 49) {
341 printk(KERN_WARNING "[nvdebug] %s: vaddr_to_find (%#018llx) is beyond the 49-bit virtual address space supported by the GPU! Failing...\n", __func__, vaddr_to_find);
342 return -EINVAL;
343 }
344
345 // NVIDIA supports up to 36-bit VID_MEM addresses
346 if (paddr_target == TARGET_VID_MEM && paddr_to_map >> 36) {
347 printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 36-bit VID_MEM address space! Failing...\n", __func__, paddr_to_map);
348 return -EINVAL;
349 }
350
351 // NVIDIA supports up to 58-bit SYS_MEM addresses
352 if ((paddr_target == TARGET_SYS_MEM_COHERENT ||
353 paddr_target == TARGET_SYS_MEM_NONCOHERENT) && paddr_to_map >> 58) {
354 printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 58-bit SYS_MEM address space! Failing...\n", __func__, paddr_to_map);
355 return -EINVAL;
356 }
357
358 // We don't support mapping to PEERs; that requires a PEER ID
359 if (paddr_target == TARGET_PEER) {
360 printk(KERN_WARNING "[nvdebug] %s: paddr_target must be SYS_MEM_* or VID_MEM! Failing...\n", __func__);
361 return -EINVAL;
362 }
363
364 printk_info(KERN_INFO "[nvdebug] Mapping addr %#018llx in page table with base %#018llx to %s address %#018llx\n", vaddr_to_find, (u64)next, target_to_text(paddr_target), paddr_to_map);
365
366 // Step through each PDE level and the PTE level
367 for (level = 0; level < 5; level++) {
368 // Index into this level
369 pde_idx = (vaddr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
370 printk_debug(KERN_DEBUG "[nvdebug] In table at KVA %#lx, using index %u in lvl %d\n", (uintptr_t)next, pde_idx, level);
371 // Hack to workaround PDE0 being double-size and strangely formatted
372 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
373 next += 8;
374 // Obtain a kernel-dereferencable address
375 next_kva = pd_deref(g, next, next_target);
376 if (IS_ERR_OR_NULL(next_kva)) {
377 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva));
378 return -ENOTRECOVERABLE;
379 }
380 // Obtain entry at this level
381 entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
382 // If pointer to next level of the table does not exist
383 if (entry.target == PD_AND_TARGET_INVALID) { // PTE or PD covered by PD_AND_TARGET_INVALID
384 if (level == 4 || (huge_page && level == 3)) {
385 // Create new PTE (allocation, as needed, is handled at level 2 or 3)
386 // Targets observed in page tables:
387 // For PCIe: entry.target == PTE_AND_TARGET_VID_MEM;
388 // For Jetson: entry.target == PTE_AND_TARGET_SYS_MEM_NONCOHERENT;
389 entry.is_pte = 1;
390 entry.aperture = paddr_target;
391 if (paddr_target == TARGET_VID_MEM)
392 entry.addr = paddr_to_map >> 12;
393 else
394 entry.addr_w = paddr_to_map >> 12;
395 // Set the volatile bit (as NVRM does for SYS_MEM_COHERENT mappings)
396 // (This does nothing if the target is VID_MEM, but if the target is
397 // SYS_MEM_*, accesses will bypass the L2.)
398 entry.is_volatile = 1;
399 // Leave other fields zero, yielding an unencrypted, unprivileged, r/w,
400 // volatile mapping with atomics enabled.
401
402 // XXX: Hack to work around PDE0 double-size weirdness. Huge
403 // page mapping will fault without this.
404 if (level == 3)
405 writeq(entry.raw_w, next_kva - 8 + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
406 } else {
407 struct page* page_dir;
408 struct nvdebug_pd_page* page_dir_reinterpret;
409 dma_addr_t page_dir_dma;
410 // Allocate one 4 KiB all-zero (all invalid) page directory/
411 // table at the next level
412 if (!(page_dir = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0)))
413 return -ENOMEM;
414 // Obtain a GPU-accessible/bus address for this page (handling
415 // I/O MMU mappings, etc.)
416 page_dir_dma = dma_map_page(g->dev, page_dir, 0, PAGE_SIZE, DMA_TO_DEVICE);
417 // Verify that we were able to create a mapping
418 if (dma_mapping_error(g->dev, page_dir_dma))
419 return dma_mapping_error(g->dev, page_dir_dma);
420 // Record this allocation for freeing later
421 // Note: Linux maintains a page struct for every page in the
422 // system. This struct has available space that drivers
423 // can use to store their own tracking information. Our
424 // struct nvdebug_pd_page facilitates this.
425 page_dir_reinterpret = (struct nvdebug_pd_page*)page_dir;
426 page_dir_reinterpret->parent_addr = next + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx;
427 page_dir_reinterpret->parent_aperture = next_target;
428 page_dir_reinterpret->dma_addr = page_dir_dma;
429 list_add(&page_dir_reinterpret->list, &g->pd_allocs);
430 // Point this entry to the new directory/table
431 entry.target = PD_AND_TARGET_SYS_MEM_COHERENT; // Observed in page tables
432 // Must use addr_w with SYS_MEM targets
433 entry.addr_w = page_dir_dma >> 12;
434 // On Jetson and NVRM, all PDEs are marked volatile
435 entry.is_volatile = 1;
436 // We don't configure ATS, so disable ATS lookups for speed.
437 entry.no_ats = 1;
438 }
439 writeq(entry.raw_w, next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
440 printk_debug(KERN_DEBUG "[nvdebug] Created %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
441 // Successfully created the requested PTE, so return
442 if (entry.is_pte)
443 return 0;
444 } else {
445 printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
446 }
447
448 // If this is the PTE level, return success if the address and target are correct
449 if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB
450 if (entry.aperture != paddr_target)
451 return -EADDRINUSE; // Also handles PEER
452 if (entry.aperture == TARGET_VID_MEM)
453 return (uint64_t)entry.addr == paddr_to_map >> 12 ? 1 : -EADDRINUSE;
454 else
455 return entry.addr_w == paddr_to_map >> 12 ? 1 : -EADDRINUSE; // SYS_MEM is wider
456 }
457
458 // If mapping a 2 MiB page and we made it here, level 3 had a PDE. This
459 // means that the requested 2 MiB virtual region already has one or more
460 // small pages mapped within it---a.k.a., the addresses are in use.
461 // If we didn't bail out here, the above logic would attempt to fallback
462 // to a 4 KiB mapping, which would be unexpected behavior.
463 if (huge_page && level == 3)
464 return -EADDRINUSE;
465
466 // Otherwise step to the next table level
467 if (entry.aperture == TARGET_VID_MEM)
468 next = (uint64_t)entry.addr << 12;
469 else
470 next = (uint64_t)entry.addr_w << 12; // SYS_MEM is wider
471 next_target = entry.target;
472 }
473
474 return -ENOTRECOVERABLE; // Should be impossible
475}
476
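A call-site sketch (not part of this commit) tying the pieces above together: create a mapping, then confirm it with the forward translation. The helper and its arguments are hypothetical.

// Hypothetical helper: map the DMA-mapped system-memory page at `dma_addr` to
// GPU virtual address `vaddr` (both 4 KiB-aligned) in the BAR2/BAR3 page
// table, then verify the mapping via translate_page_directory().
static int example_map_and_check(struct nvdebug_state *g, uint64_t vaddr,
                                 dma_addr_t dma_addr) {
	page_dir_config_t pd_config;
	uint64_t check_addr;
	enum INST_TARGET check_ap;
	int err;
	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	// 0 == new mapping created; 1 == identical mapping already present
	err = map_page_directory(g, pd_config, vaddr, dma_addr,
	                         TARGET_SYS_MEM_COHERENT, false);
	if (err < 0)
		return err;
	if ((err = translate_page_directory(g, pd_config, vaddr, &check_addr, &check_ap)) < 0)
		return err;
	// A thorough check would also confirm check_ap is a SYS_MEM target
	return check_addr == (uint64_t)dma_addr ? 0 : -EIO;
}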
146/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables 477/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
147 (See `search_page_directory()` for documentation.) 478 (See `search_page_directory()` for documentation.)
148 */ 479 */
@@ -187,7 +518,7 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
187 // Verify PDE is present 518 // Verify PDE is present
188 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) 519 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
189 continue; 520 continue;
190// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); 521 // TODO: Handle huge pages
191 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); 522 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
192 // For each PTE 523 // For each PTE
193 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { 524 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
@@ -215,7 +546,84 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
215 return 0; 546 return 0;
216} 547}
217 548
549/* GPU Virtual address -> Physical address ("forward" translation) for V1 tables
550 (See `translate_page_directory()` for documentation.)
551*/
552int translate_v1_page_directory(struct nvdebug_state *g,
553 page_dir_config_t pd_config,
554 uint64_t addr_to_find,
555 uint64_t *found_addr /* out */,
556 enum INST_TARGET *found_aperture /* out */) {
557 page_dir_entry_v1_t pde;
558 page_tbl_entry_v1_t pte;
559 uintptr_t pde_idx, pde_phys, pte_idx, pte_phys;
560 void __iomem *pte_kva, *pde_kva;
561
562 *found_addr = 0;
563 *found_aperture = TARGET_INVALID;
564
565 // Make sure that the query is page-aligned (likely mistake otherwise)
566 if (addr_to_find & 0xfff) {
567 printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_v1_page_directory()!\n", addr_to_find);
568 return -EINVAL;
569 }
570
571 // This function only understands the Page Table Version 1 format
572 if (pd_config.is_ver2) {
573 printk(KERN_ERR "[nvdebug] Passed a Version 2 page table at %#018llx to translate_v1_page_directory()!\n", (uint64_t)pd_config.page_dir << 12);
574 return -EINVAL;
575 }
576
577 // We only understand the Version 1 format when 128 KiB huge pages are in-use
578 if (pd_config.is_64k_big_page) {
579 printk(KERN_ERR "[nvdebug] Page Table Version 1 with 64 KiB huge pages is unsupported!\n");
580 return -EINVAL;
581 }
582
583 printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V1 page table with base %#018llx\n", (uint64_t)addr_to_find, (uint64_t)pd_config.page_dir << 12);
584
585 // Shift bits which define PDE index to start at bit 0, and mask other bits
586 pde_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[0]) & (NV_MMU_PT_V1_SZ[0] - 1);
587 // Compute VID_MEM/SYS_MEM address of page directory entry
588 pde_phys = ((uint64_t)pd_config.page_dir << 12) + pde_idx * sizeof(page_dir_entry_v1_t);
589 // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA)
590 pde_kva = pd_deref(g, pde_phys, INST2PD_TARGET(pd_config.target));
591 if (IS_ERR_OR_NULL(pde_kva)) {
592 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_phys, target_to_text(pd_config.target), PTR_ERR(pde_kva));
593 return PTR_ERR(pde_kva);
594 }
595 // Read page directory entry (readq seems to work fine; tested on GM204)
596 pde.raw = readq(pde_kva);
597 // Verify this PDE points to an array of page table entries
598 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
599 return -ENXIO;
600 // TODO: Check for and handle huge pages
601 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
602
603 // Shift bits which define PTE index to start at bit 0, and mask other bits
604 pte_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[1]) & (NV_MMU_PT_V1_SZ[1] - 1);
605 // Compute VID_MEM/SYS_MEM address of page table entry
606 pte_phys = ((uint64_t)pde.alt_addr << 12) + pte_idx * sizeof(page_tbl_entry_v1_t);
607 // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA)
608 pte_kva = pd_deref(g, pte_phys, V12PD_TARGET(pde.alt_target));
 609	if (IS_ERR_OR_NULL(pte_kva)) {
610 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_phys, pd_target_to_text(pde.alt_target), PTR_ERR(pte_kva));
611 return PTR_ERR(pte_kva);
612 }
613 // Read page table entry
614 pte.raw = readq(pte_kva);
615 // XXX: The above readq() is bogus on gk104 (returns -1). Potential issue of pd_deref's move of PRAMIN racing with the driver?
616 if (!pte.is_present)
617 return -ENXIO;
618 printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)pte.addr) << 12, target_to_text(pte.target), pte.is_volatile, pte.is_privileged, pte.is_readonly, pte.atomics_disabled, pte.raw);
619 // Access PTE and return physical address
620 *found_addr = (uint64_t)pte.addr << 12;
621 *found_aperture = pte.target;
622 return 0;
623}
624
 218/* *** UNTESTED *** 625/* *** UNTESTED ***
626// This is only relevant on pre-Kepler GPUs; not a current priority
219#define NV_MMU_PT_V0_SZ 2048 627#define NV_MMU_PT_V0_SZ 2048
220#define NV_MMU_PT_V0_LSB 29 628#define NV_MMU_PT_V0_LSB 29
221uint64_t search_v0_page_directory(struct nvdebug_state *g, 629uint64_t search_v0_page_directory(struct nvdebug_state *g,
diff --git a/nvdebug.h b/nvdebug.h
index ca0f514..3ac8db4 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -2,6 +2,7 @@
2 * SPDX-License-Identifier: MIT 2 * SPDX-License-Identifier: MIT
3 * 3 *
4 * File outline: 4 * File outline:
5 * - Configuration options
5 * - Runlist, preemption, and channel control (FIFO) 6 * - Runlist, preemption, and channel control (FIFO)
6 * - Basic GPU information (MC) 7 * - Basic GPU information (MC)
7 * - Detailed GPU information (PTOP, FUSE, and CE) 8 * - Detailed GPU information (PTOP, FUSE, and CE)
@@ -20,6 +21,27 @@
20// this, so declare as incomplete type to avoid pulling in the nvgpu headers. 21// this, so declare as incomplete type to avoid pulling in the nvgpu headers.
21struct gk20a; 22struct gk20a;
22 23
24// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
25// in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN
26// **must** not be moved during runlist traversal.
27// - The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this
28// must be enabled to print the runlist on the TX2.
29// - On the A100 in Google Cloud and H100 in Paperspace, as of Aug 2024, this is
30// needed, as nvdebug is not finding (at least) runlist0 mapped in BAR2/3.
31// Automatically disables printing Instance Block and Context State while
32// traversing the runlist, as these require conflicting uses of PRAMIN (it's
33// needed to search the page tables for the Instance Block in BAR2/3, and to
34// access anything in the Context State---aka CTXSW).
35#define FALLBACK_TO_PRAMIN
36
37// Starting offset for registers in the corresponding named range
38// Programmable First-In First-Out unit; also known as "Host"
39#define NV_PFIFO 0x00002000 // 8 KiB long; ends prior to 0x00004000
40// Programmable Channel Control System RAM
41#define NV_PCCSR 0x00800000 // 16 KiB long; ends prior to 0x00810000
42// Programmable TOPology registers
43#define NV_PTOP 0x00022400 // 1 KiB long; ends prior to 0x00022800
44
23/* Runlist Channel 45/* Runlist Channel
24 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue 46 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
25 of GPU commands. These commands are typically queued from userspace. 47 of GPU commands. These commands are typically queued from userspace.
@@ -202,7 +224,7 @@ typedef union {
202 Support: Ampere, Hopper, Ada, [newer untested] 224 Support: Ampere, Hopper, Ada, [newer untested]
203*/ 225*/
204#define NV_RUNLIST_PREEMPT_GA100 0x098 226#define NV_RUNLIST_PREEMPT_GA100 0x098
205#define PREEMPT_TYPE_RUNLIST 0 227#define PREEMPT_TYPE_RUNLIST PREEMPT_TYPE_CHANNEL
206 228
207/* 229/*
208 "Initiate a preempt of the engine by writing the bit associated with its 230 "Initiate a preempt of the engine by writing the bit associated with its
@@ -355,6 +377,14 @@ typedef union {
355 uint64_t raw; 377 uint64_t raw;
356} runlist_base_tu102_t; 378} runlist_base_tu102_t;
357 379
380/*
381 LEN : Read/Write
382 OFFSET : Read/Write
383 PREEMPTED_TSGID : Read-only
384 VALID_PREEMPTED_TSGID : Read-only
385 IS_PENDING : Read-only
386 PREEMPTED_OFFSET : Read-only
387*/
358typedef union { 388typedef union {
359 struct { 389 struct {
360 uint16_t len:16; 390 uint16_t len:16;
@@ -416,6 +446,27 @@ typedef union {
416 uint32_t raw; 446 uint32_t raw;
417} runlist_channel_config_t; 447} runlist_channel_config_t;
418 448
449/* Context Switch Timeout Configuration
450 After a task's budget expires, there's a configurable grace period, a
451 "timeout", within which the context needs to complete. After this timeout
452 expires, an interrupt is raised to terminate the task.
453
454 This register configures if such a timeout is enabled and how long the
455 timeout is (the "period").
456
457 Support: Volta, Turing
458*/
459#define NV_PFIFO_ENG_CTXSW_TIMEOUT 0x00002A0C
460// Support: Ampere
461#define NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(i) (0x220+(i)*64)
462typedef union {
463 struct {
464 uint32_t period:31;
465 bool enabled:1;
466 } __attribute__((packed));
467 uint32_t raw;
468} ctxsw_timeout_t;
469
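A call-site sketch (not part of this commit) decoding the pre-Ampere register with the union above:

// Hypothetical call site: decode the PFIFO-global context switch timeout
// configuration on Volta/Turing (on Ampere, use the per-runlist register).
static void example_print_ctxsw_timeout(struct nvdebug_state *g) {
	ctxsw_timeout_t timeout;
	if ((timeout.raw = nvdebug_readl(g, NV_PFIFO_ENG_CTXSW_TIMEOUT)) == -1)
		return; // Bad register read
	printk(KERN_INFO "[nvdebug] Context switch timeout %s, period %u\n",
	       timeout.enabled ? "enabled" : "disabled", timeout.period);
}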
419/* Programmable Channel Control System RAM (PCCSR) 470/* Programmable Channel Control System RAM (PCCSR)
420 512-entry array of channel control and status data structures. 471 512-entry array of channel control and status data structures.
421 472
@@ -477,8 +528,15 @@ typedef union {
477 bool busy:1; 528 bool busy:1;
478 uint32_t :3; 529 uint32_t :3;
479 } __attribute__((packed)); 530 } __attribute__((packed));
531 struct {
532 uint32_t word1;
533 uint32_t word2;
534 } __attribute__((packed));
480 uint64_t raw; 535 uint64_t raw;
481} channel_ctrl_t; 536} channel_ctrl_gf100_t;
537
538// TODO: Remove use of deprecated type name
539typedef channel_ctrl_gf100_t channel_ctrl_t;
482 540
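For illustration only (not part of this commit), a caller might fetch a channel's control word using the 32-bit word1/word2 view added above, assuming the 512 64-bit PCCSR entries are packed back-to-back starting at NV_PCCSR:

// Hypothetical helper: read the PCCSR entry for channel `chid` as two 32-bit
// words (assumes an 8-byte stride; the stride is not restated in this excerpt).
static inline channel_ctrl_gf100_t example_read_channel_ctrl(struct nvdebug_state *g,
                                                             unsigned int chid) {
	channel_ctrl_gf100_t chan;
	chan.word1 = nvdebug_readl(g, NV_PCCSR + chid * 8);
	chan.word2 = nvdebug_readl(g, NV_PCCSR + chid * 8 + 4);
	return chan;
}
// Usage sketch: if (example_read_channel_ctrl(g, 0).busy) printk(KERN_INFO "[nvdebug] Channel 0 busy\n");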
483/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) 541/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+)
484 Starting with Ampere, channel IDs are no longer unique indexes into the 542 Starting with Ampere, channel IDs are no longer unique indexes into the
@@ -543,6 +601,8 @@ typedef union {
543 Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing 601 Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing
544*/ 602*/
545#define NV_PFIFO_SCHED_DISABLE 0x00002630 603#define NV_PFIFO_SCHED_DISABLE 0x00002630
604// Support: Ampere
605#define NV_RUNLIST_SCHED_DISABLE 0x094
546typedef union { 606typedef union {
547 struct { 607 struct {
548 bool runlist_0:1; 608 bool runlist_0:1;
@@ -1018,7 +1078,7 @@ typedef union {
1018 struct { 1078 struct {
1019 uint32_t ptr:28; 1079 uint32_t ptr:28;
1020 enum INST_TARGET target:2; 1080 enum INST_TARGET target:2;
1021 uint32_t :1; 1081 uint32_t :1; // disable_cya_debug for BAR2
1022 bool is_virtual:1; 1082 bool is_virtual:1;
1023 } __attribute__((packed)); 1083 } __attribute__((packed));
1024 uint32_t raw; 1084 uint32_t raw;
@@ -1091,6 +1151,9 @@ typedef union {
1091 Support: Tesla 2.0* through Ampere, Ada 1151 Support: Tesla 2.0* through Ampere, Ada
1092 *FAULT_REPLAY_* fields are Pascal+ only 1152 *FAULT_REPLAY_* fields are Pascal+ only
1093 See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc) 1153 See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc)
1154
1155 It appears that on Hopper, IS_VER2 continues to mean IS_VER2, but if unset, the
1156 alternative is VER3.
1094*/ 1157*/
1095#define NV_PRAMIN_PDB_CONFIG_OFF 0x200 1158#define NV_PRAMIN_PDB_CONFIG_OFF 0x200
1096typedef union { 1159typedef union {
@@ -1101,7 +1164,7 @@ typedef union {
1101 bool fault_replay_tex:1; 1164 bool fault_replay_tex:1;
1102 bool fault_replay_gcc:1; 1165 bool fault_replay_gcc:1;
1103 uint32_t :4; 1166 uint32_t :4;
1104 bool is_ver2:1; 1167 bool is_ver2:1; // XXX: Not on Hopper. May be set or not for same page_dir.
1105 bool is_64k_big_page:1; // 128Kb otherwise 1168 bool is_64k_big_page:1; // 128Kb otherwise
1106 uint32_t page_dir_lo:20; 1169 uint32_t page_dir_lo:20;
1107 uint32_t page_dir_hi:32; 1170 uint32_t page_dir_hi:32;
@@ -1421,6 +1484,182 @@ typedef union {
1421} page_tbl_entry_v0_t; 1484} page_tbl_entry_v0_t;
1422*/ 1485*/
1423 1486
1487/* Fifo Context RAM (RAMFC) and channel INstance RAM (RAMIN)
1488
1489 Each channel is configured with a 4 KiB instance block. The prefix of this
1490 block is referred to as RAMFC and stores channel-specific state for the Host
1491 (aka PFIFO).
1492
1493 "A GPU instance block is a block of memory that contains the state
1494 for a GPU context. A GPU context's instance block consists of Host state,
1495 pointers to each engine's state, and memory management state. A GPU instance
1496 block also contains a pointer to a block of memory that contains that part of a
1497 GPU context's state that a user-level driver may access. A GPU instance block
1498 fits within a single 4K-byte page of memory."
1499
1500 "The NV_RAMFC part of a GPU-instance block contains Host's part of a virtual
1501 GPU's state. Host is referred to as "FIFO". "FC" stands for FIFO Context.
1502 When Host switches from serving one GPU context to serving a second, Host saves
1503 state for the first GPU context to the first GPU context's RAMFC area, and loads
1504 state for the second GPU context from the second GPU context's RAMFC area."
1505
1506 "Every Host word entry in RAMFC directly corresponds to a PRI-accessible
1507 register. For a description of the contents of a RAMFC entry, please see the
1508 description of the corresponding register in "manuals/dev_pbdma.ref". The
1509 offsets of the fields within each entry in RAMFC match those of the
1510 corresponding register in the associated PBDMA unit's PRI space."
1511
1512 In summary, RAMFC includes details such as the head and tail of the pushbuffer,
1513 and RAMIN includes details such as the page table configuration(s).
1514
1515 The instance-global page table (as defined in the PDB field) is only used for
1516 GPU engines which do not support subcontexts (non-VEID engines).
1517
1518 **Not all documented fields are currently populated below.**
1519
1520 Support: *Kepler, *Maxwell, *Pascal, Volta, Turing, Ampere, [newer untested]
1521 *Pre-Volta GPUs do not support subcontexts.
1522 See also: dev_ram.ref.txt and dev_pbdma.ref.txt in NVIDIA's open-gpu-doc
1523*/
1524
1525// 16-byte (128-bit) substructure defining a subcontext configuration
1526typedef struct {
1527 page_dir_config_t pdb;
1528 uint32_t pasid:20; // Process Address Space ID (PASID) used for ATS
1529 uint32_t :11;
1530 bool enable_ats:1; // Enable Address Translation Services (ATS)?
1531 uint32_t pad;
1532} __attribute__((packed)) subcontext_ctrl_t;
1533
1534typedef struct {
1535// Start RAMFC (512 bytes)
1536 uint32_t pad[43];
1537 uint32_t fc_target:5; // NV_RAMFC_TARGET; off 43
1538 uint32_t :27;
1539 uint32_t pad2[17];
1540 uint32_t fc_config_l2:1; // NV_RAMFC_CONFIG; off 61
1541 uint32_t :3;
1542 uint32_t fc_config_ce_split:1;
1543 uint32_t fc_config_ce_no_throttle:1;
1544 uint32_t :2;
1545 uint32_t fc_config_is_priv:1; // ...AUTH_LEVEL
1546 uint32_t :3;
1547 uint32_t fc_config_userd_writeback:1; // ...USERD_WRITEBACK
1548 uint32_t :19;
1549 uint32_t pad3[1];
1550 uint32_t fc_chan_info_scg:1; // ...SET_CHANNEL_INFO_SCG_TYPE
1551 uint32_t :7;
1552 uint32_t fc_chan_info_veid:6; // ...SET_CHANNEL_INFO_VEID
1553 uint32_t fc_chan_info_chid:12; // ...SET_CHANNEL_INFO_CHID
1554 uint32_t :6;
1555 uint32_t pad4[64];
1556// End RAMFC
1557// Start RAMIN
1558 page_dir_config_t pdb;
1559 uint32_t pad5[2];
1560 // WFI_TARGET appears to be ignored if WFI_IS_VIRTUAL
1561 uint32_t engine_wfi_target:2; // NV_RAMIN_ENGINE_WFI_TARGET; off 132
1562 uint32_t engine_wfi_is_virtual:1;
1563 uint32_t :9;
1564 // WFI_PTR points to a CTXSW block (documented below)
1565 uint64_t engine_wfi_ptr:52; // NV_RAMIN_ENGINE_WFI_PTR_LO/_HI; off 132--133
1566 uint32_t engine_wfi_veid:6; // NV_RAMIN_ENGINE_WFI_VEID; off 134; VEID == Subcontext ID
1567 uint32_t :26;
1568 uint32_t pasid:20; // NV_RAMIN_PASID; off 135; "Process Address Space ID"
1569 uint32_t :11;
1570 bool enable_ats:1;
1571 uint32_t pad6[30];
1572 uint64_t subcontext_pdb_valid; // NV_RAMIN_SC_PDB_VALID; off 166-167
1573 subcontext_ctrl_t subcontext[64]; // NV_RAMIN_SC_*; off 168-424
1574} __attribute__((packed)) instance_ctrl_t;
1575
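A call-site sketch (not part of this commit) walking the subcontext table of an instance block obtained via instance_deref() (declared later in this header), assuming one valid bit per VEID in subcontext_pdb_valid:

// Hypothetical call site: print the page directory base of every subcontext
// (VEID) that the instance block marks as valid.
static void example_print_valid_veids(struct nvdebug_state *g,
                                      uint64_t instance_addr,
                                      enum INST_TARGET instance_target) {
	int veid;
	instance_ctrl_t *inst = instance_deref(g, instance_addr, instance_target);
	if (IS_ERR_OR_NULL(inst))
		return;
	for (veid = 0; veid < 64; veid++)
		if (inst->subcontext_pdb_valid & (1ULL << veid))
			printk(KERN_INFO "[nvdebug] VEID %d: PDB at %#018llx (%s)\n", veid,
			       (u64)inst->subcontext[veid].pdb.page_dir << 12,
			       inst->subcontext[veid].pdb.is_ver2 ? "V2" : "V1");
}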
1576// Context types
1577enum CTXSW_TYPE {
1578 CTXSW_UNDEFINED = 0x0,
1579 CTXSW_OPENGL = 0x8,
1580 CTXSW_DX9 = 0x10,
1581 CTXSW_DX10 = 0x11,
1582 CTXSW_DX11 = 0x12,
1583 CTXSW_COMPUTE = 0x20,
1584 CTXSW_HEADER = 0x21 // A per-subcontext header
1585};
1586static inline const char *ctxsw_type_to_text(enum CTXSW_TYPE t) {
1587 switch (t) {
1588 case CTXSW_UNDEFINED:
1589 return "[None]";
1590 case CTXSW_OPENGL:
1591 return "OpenGL";
1592 case CTXSW_DX9:
1593 case CTXSW_DX10:
1594 case CTXSW_DX11:
1595 return "DirectX";
1596 case CTXSW_COMPUTE:
1597 return "Compute";
1598 case CTXSW_HEADER:
1599 return "Header";
1600 default:
1601 return "UNKNOWN";
1602 }
1603}
1604
1605// Preemption modes:
1606// WFI: Wait For Idle (preempt on idle)
1607// CTA: Cooperative Thread Array-level Preemption (preempt at end of block)
1608// CILP: Compute-Instruction-Level Preemption (preempt at end of instruction)
1609enum GRAPHICS_PREEMPT_TYPE {PREEMPT_WFI, PREEMPT_GFXP};
1610enum COMPUTE_PREEMPT_TYPE {_PREEMPT_WFI, PREEMPT_CTA, PREEMPT_CILP};
1611static inline const char *compute_preempt_type_to_text(enum COMPUTE_PREEMPT_TYPE t) {
1612 switch (t) {
1613 case PREEMPT_WFI:
1614 return "WFI";
1615 case PREEMPT_CTA:
1616 return "CTA";
1617 case PREEMPT_CILP:
1618 return "CILP";
1619 default:
1620 return "INVALID";
1621 }
1622}
 1623static inline const char *graphics_preempt_type_to_text(enum GRAPHICS_PREEMPT_TYPE t) {
1624 switch (t) {
1625 case PREEMPT_WFI:
1626 return "WFI";
1627 case PREEMPT_GFXP:
1628 return "GFXP";
1629 default:
1630 return "INVALID";
1631 }
1632}
1633
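A call-site sketch (not part of this commit) of how these mode values and helpers might be used with runlist.c's set_channel_preemption_mode(), which is declared further down in this header:

// Hypothetical call site: request CTA-level preemption for channel `chan_id`
// on runlist `rl_id` and log the outcome.
static int example_request_cta_preemption(struct nvdebug_state *g,
                                          uint32_t chan_id, uint32_t rl_id) {
	int err = set_channel_preemption_mode(g, chan_id, rl_id, PREEMPT_CTA);
	printk(KERN_INFO "[nvdebug] %s preemption request for channel %u: %d\n",
	       compute_preempt_type_to_text(PREEMPT_CTA), chan_id, err);
	return err;
}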
1634/* ConTeXt SWitch control block (CTXSW)
1635 Support: Maxwell*, Pascal**, Volta, Turing, Ampere, Ada
1636 *Nothing except for CONTEXT_ID and TYPE
1637 **Except as noted
1638 See also: manuals/volta/gv100/dev_ctxsw.ref.txt in open-gpu-doc
1639 and hw_ctxsw_prog_*.h in nvgpu
1640*/
1641// (Note that this layout changes some generation-to-generation)
1642typedef struct context_switch_block {
1643 uint32_t pad[3];
1644 enum CTXSW_TYPE type:6; // Unused except when type CTXSW_HEADER?
1645 uint32_t :26;
1646 uint32_t pad2[26];
1647 // The context buffer ptr fields are in an opposite-of-typical order, so we
1648 // can't merge them into a single context_buffer_ptr field.
1649 uint32_t context_buffer_ptr_hi; // Volta+ only
1650 uint32_t context_buffer_ptr_lo; // Volta+ only
1651 enum GRAPHICS_PREEMPT_TYPE graphics_preemption_options:32;
1652 enum COMPUTE_PREEMPT_TYPE compute_preemption_options:32;
1653 uint32_t pad3[18];
1654 uint32_t num_wfi_save_operations;
1655 uint32_t num_cta_save_operations;
1656 uint32_t num_gfxp_save_operations;
1657 uint32_t num_cilp_save_operations;
1658 uint32_t pad4[4];
1659 uint32_t context_id;
1660 // [There are more fields not yet added here.]
1661} __attribute__((packed)) context_switch_ctrl_t;
1662
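A call-site sketch (not part of this commit) using get_ctxsw() (declared later in this header) to inspect the block above; the helper name is hypothetical.

// Hypothetical call site: given an instance block, print the configured
// compute preemption mode and the CILP save counter from its CTXSW block.
static void example_print_ctxsw_info(struct nvdebug_state *g, instance_ctrl_t *inst) {
	context_switch_ctrl_t *ctxsw = get_ctxsw(g, inst);
	if (IS_ERR_OR_NULL(ctxsw))
		return;
	printk(KERN_INFO "[nvdebug] Context %u: compute preemption %s, %u CILP saves\n",
	       ctxsw->context_id,
	       compute_preempt_type_to_text(ctxsw->compute_preemption_options),
	       ctxsw->num_cilp_save_operations);
}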
1424/* VRAM Information 1663/* VRAM Information
1425 1664
1426 If ECC is disabled: 1665 If ECC is disabled:
@@ -1452,6 +1691,12 @@ static inline uint64_t memory_range_to_bytes(memory_range_t range) {
1452 1691
1453/* Begin nvdebug types and functions */ 1692/* Begin nvdebug types and functions */
1454 1693
1694// __iomem is only defined when building as a kernel module, so conditionally
1695// define it to allow including this header outside the kernel.
1696#ifndef __iomem
1697#define __iomem
1698#endif
1699
1455// Vendor ID for PCI devices manufactured by NVIDIA 1700// Vendor ID for PCI devices manufactured by NVIDIA
1456#define NV_PCI_VENDOR 0x10de 1701#define NV_PCI_VENDOR 0x10de
1457struct nvdebug_state { 1702struct nvdebug_state {
@@ -1474,6 +1719,10 @@ struct nvdebug_state {
1474 struct platform_device *platd; 1719 struct platform_device *platd;
1475 // Pointer to generic device struct (both platform and pcie devices) 1720 // Pointer to generic device struct (both platform and pcie devices)
1476 struct device *dev; 1721 struct device *dev;
1722#ifdef __KERNEL__
1723 // List used by mmu.c to track allocated pages for page directories/tables
1724 struct list_head pd_allocs;
1725#endif
1477}; 1726};
1478 1727
1479// This disgusting macro is a crutch to work around the fact that runlists were 1728// This disgusting macro is a crutch to work around the fact that runlists were
@@ -1542,7 +1791,19 @@ int get_runlist_iter(
1542 struct runlist_iter *rl_iter /* out */); 1791 struct runlist_iter *rl_iter /* out */);
1543int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id); 1792int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id);
1544int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id); 1793int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id);
1545int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id); 1794int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off);
1795instance_ctrl_t *instance_deref(
1796 struct nvdebug_state *g,
1797 uint64_t instance_addr,
1798 enum INST_TARGET instance_target);
1799context_switch_ctrl_t *get_ctxsw(
1800 struct nvdebug_state *g,
1801 instance_ctrl_t *inst);
1802int set_channel_preemption_mode(
1803 struct nvdebug_state *g,
1804 uint32_t chan_id,
1805 uint32_t rl_id,
1806 enum COMPUTE_PREEMPT_TYPE mode);
1546 1807
1547// Defined in mmu.c 1808// Defined in mmu.c
1548uint64_t search_page_directory( 1809uint64_t search_page_directory(
@@ -1550,11 +1811,33 @@ uint64_t search_page_directory(
1550 page_dir_config_t pd_config, 1811 page_dir_config_t pd_config,
1551 uint64_t addr_to_find, 1812 uint64_t addr_to_find,
1552 enum INST_TARGET addr_to_find_aperture); 1813 enum INST_TARGET addr_to_find_aperture);
1814int translate_page_directory(
1815 struct nvdebug_state *g,
1816 page_dir_config_t pd_config,
1817 uint64_t addr_to_find,
1818 uint64_t *found_addr /* out */,
1819 enum INST_TARGET *found_aperture /* out */);
1820int map_page_directory(
1821 struct nvdebug_state *g,
1822 page_dir_config_t pd_config,
 1823 uint64_t vaddr_to_find,
 1824 uint64_t paddr_to_map,
1825 enum INST_TARGET paddr_target,
1826 bool huge_page);
1827int gc_page_directory(
1828 struct nvdebug_state *g,
1829 bool force);
1553uint64_t search_v1_page_directory( 1830uint64_t search_v1_page_directory(
1554 struct nvdebug_state *g, 1831 struct nvdebug_state *g,
1555 page_dir_config_t pd_config, 1832 page_dir_config_t pd_config,
1556 uint64_t addr_to_find, 1833 uint64_t addr_to_find,
1557 enum INST_TARGET addr_to_find_aperture); 1834 enum INST_TARGET addr_to_find_aperture);
1835int translate_v1_page_directory(
1836 struct nvdebug_state *g,
1837 page_dir_config_t pd_config,
1838 uint64_t addr_to_find,
1839 uint64_t *found_addr /* out */,
1840 enum INST_TARGET *found_aperture /* out */);
1558// Defined in bus.c 1841// Defined in bus.c
1559int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); 1842int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target);
1560int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */); 1843int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */);
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 3a10e13..c0cfa63 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -15,7 +15,7 @@
15 15
16// Enable to intercept and log GPU interrupts. Historically used to benchmark 16// Enable to intercept and log GPU interrupts. Historically used to benchmark
17// interrupt latency. 17// interrupt latency.
18#define INTERRUPT_DEBUG 0 18#define INTERRUPT_DEBUG
19 19
20// MIT is GPL-compatible. We need to be GPL-compatible for symbols like 20// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
21// platform_bus_type or bus_find_device_by_name... 21// platform_bus_type or bus_find_device_by_name...
@@ -28,12 +28,20 @@ extern struct file_operations runlist_file_ops;
28extern struct file_operations preempt_tsg_file_ops; 28extern struct file_operations preempt_tsg_file_ops;
29extern struct file_operations disable_channel_file_ops; 29extern struct file_operations disable_channel_file_ops;
30extern struct file_operations enable_channel_file_ops; 30extern struct file_operations enable_channel_file_ops;
31extern struct file_operations wfi_preempt_channel_file_ops;
32extern struct file_operations cta_preempt_channel_file_ops;
33extern struct file_operations cil_preempt_channel_file_ops;
31extern struct file_operations resubmit_runlist_file_ops; 34extern struct file_operations resubmit_runlist_file_ops;
35extern struct file_operations preempt_runlist_file_ops;
36extern struct file_operations ack_bad_tsg_file_ops;
37extern struct file_operations map_mem_chid_file_ops;
38extern struct file_operations map_mem_ctxid_file_ops;
32extern struct file_operations switch_to_tsg_file_ops; 39extern struct file_operations switch_to_tsg_file_ops;
33// device_info_procfs.c 40// device_info_procfs.c
34extern struct file_operations device_info_file_ops; 41extern struct file_operations device_info_file_ops;
35extern struct file_operations nvdebug_read_reg32_file_ops; 42extern struct file_operations nvdebug_read_reg32_file_ops;
36extern struct file_operations nvdebug_read_reg_range_file_ops; 43extern struct file_operations nvdebug_read_reg_range_file_ops;
44extern struct file_operations nvdebug_read_part_file_ops;
37extern struct file_operations local_memory_file_ops; 45extern struct file_operations local_memory_file_ops;
38// copy_topology_procfs.c 46// copy_topology_procfs.c
39extern struct file_operations copy_topology_file_ops; 47extern struct file_operations copy_topology_file_ops;
@@ -71,9 +79,271 @@ const struct file_operations* compat_ops(const struct file_operations* ops) {
71} 79}
72#endif 80#endif
73 81
74#if INTERRUPT_DEBUG 82#ifdef INTERRUPT_DEBUG
83
84void nvdebug_fifo_intr(struct nvdebug_state *g) {
85 uint32_t fifo_intr_mask;// = nvdebug_readl(g, 0x02100); // PFIFO_INTR_0
86 fifo_intr_mask = nvdebug_readl(g, 0x02100); // PFIFO_INTR_0
87 if (fifo_intr_mask & 1 << 0)
88 printk(KERN_INFO "[nvdebug] - Interrupt BIND_ERROR.\n");
89 if (fifo_intr_mask & 1 << 1)
90 printk(KERN_INFO "[nvdebug] - Interrupt CTXSW_TIMEOUT.\n");
91 if (fifo_intr_mask & 1 << 4)
92 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_IDLE.\n");
93 if (fifo_intr_mask & 1 << 5)
94 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_AND_ENG_IDLE.\n");
95 if (fifo_intr_mask & 1 << 6)
96 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE.\n");
97 if (fifo_intr_mask & 1 << 7)
98 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE_AND_ENG_IDLE.\n");
99 if (fifo_intr_mask & 1 << 8)
100 printk(KERN_INFO "[nvdebug] - Interrupt SCHED_ERROR.\n");
101 if (fifo_intr_mask & 1 << 16)
102 printk(KERN_INFO "[nvdebug] - Interrupt CHSW_ERROR.\n");
103 if (fifo_intr_mask & 1 << 23)
104 printk(KERN_INFO "[nvdebug] - Interrupt MEMOP_TIMEOUT.\n");
105 if (fifo_intr_mask & 1 << 24)
106 printk(KERN_INFO "[nvdebug] - Interrupt LB_ERROR.\n");
107 if (fifo_intr_mask & 1 << 25) // OLD; Pascal
108 printk(KERN_INFO "[nvdebug] - Interrupt REPLAYABLE_FAULT_ERROR.\n");
109 if (fifo_intr_mask & 1 << 27) // OLD; Pascal
110 printk(KERN_INFO "[nvdebug] - Interrupt DROPPED_MMU_FAULT.\n");
111 if (fifo_intr_mask & 1 << 28) { // On Pascal, this is MMU_FAULT
112 if (g->chip_id <= NV_CHIP_ID_VOLTA) // MMU_FAULT on Pascal (nvgpu, l4t/l4t-r28.1:drivers/gpu/nvgpu/include/nvgpu/hw/gp10b/hw_fifo_gp10b.h)
113 printk(KERN_INFO "[nvdebug] - Interrupt MMU_FAULT.\n");
114 else // Repurposed starting with Turing: open-gpu-doc/manuals/turing/tu104/dev_fifo.ref.txt
115 printk(KERN_INFO "[nvdebug] - Interrupt TSG_PREEMPT_COMPLETE.\n");
116 }
117 if (fifo_intr_mask & 1 << 29)
118 printk(KERN_INFO "[nvdebug] - Interrupt PBDMA_INTR.\n");
119 if (fifo_intr_mask & 1 << 30) {
120 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_EVENT.\n");
121 uint32_t fifo_runlist_intr_mask = nvdebug_readl(g, 0x02A00); // PFIFO_INTR_RUNLIST
122 printk(KERN_INFO "[nvdebug] - Event %#x.\n", fifo_runlist_intr_mask);
123 }
124 if (fifo_intr_mask & 1 << 31)
125 printk(KERN_INFO "[nvdebug] - Interrupt CHANNEL_INTR.\n");
126}
127
75irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { 128irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
76 printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num); 129 struct nvdebug_state *g = dev;
130	u64 time = ktime_get_raw_ns(); // CLOCK_MONOTONIC_RAW
131 // NV_PMC_INTR does not exist on Ada, so use NV_FUNC_PRIV_CPU_INTR_TOP
132 // Note that this also appears to exist on Turing
133	if (g->chip_id >= NV_CHIP_ID_TURING) { // was NV_CHIP_ID_AMPERE
134 int i;
135	// Despite being an indexed register, it is only documented to have one entry, and could only support two
136 uint32_t intr_mask0 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1600); // NV_FUNC_PRIV_CPU_INTR_TOP(0)
137 uint32_t intr_mask1 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1604); // NV_FUNC_PRIV_CPU_INTR_TOP(1)
138 printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with CPU_INTR_TOP(0) %#010x, ...(1) %#010x @ %llu.\n", irq_num, intr_mask0, intr_mask1, time);
139 for (i = 0; i < 8; i++) {
140 uint32_t leaf = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1000 + i*4); // NV_FUNC_PRIV_CPU_INTR_LEAF(0) to ...(7)
141 if (leaf)
142 printk(KERN_INFO "[nvdebug] - Interrupt leaf %d: %#010x\n", i, leaf);
143 // 131-133 & 64 are faults on tu104??? (open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt)
144 if (136 / 32 == i && 1 << (136 % 32) & leaf) // PFIFO0 ga100
145 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO0.\n");
146 if (137 / 32 == i && 1 << (137 % 32) & leaf) // PFIFO1 ga100
147 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO1.\n");
148 if (148 / 32 == i && 1 << (148 % 32) & leaf) // TIMER ga100
149 printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n");
150 if (152 / 32 == i && 1 << (152 % 32) & leaf) // PMU ga100
151 printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n");
152 if (156 / 32 == i && 1 << (156 % 32) & leaf) { // PBUS ga100
153 printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n");
154 uint32_t bus_intr = nvdebug_readl(g, 0x1100); // BUS_INTR_0
155 if (bus_intr & 1 << 2) {
156 // use timer_pri_timeout_save_0_r
157 uint32_t SAVE_0 = nvdebug_readl(g, 0x00009084); // NV_PTIMER_PRI_TIMEOUT_SAVE_0
158 printk(KERN_INFO "[nvdebug] - Interrupt PRI_FECSERR on %s to address %#010x %stargeting FECS.\n", SAVE_0 & 0x2 ? "write" : "read", SAVE_0 & 0x00fffffc, SAVE_0 & 0x80000000 ? "" : "not ");
159 uint32_t SAVE_1 = nvdebug_readl(g, 0x00009088); // NV_PTIMER_PRI_TIMEOUT_SAVE_1
160 if (SAVE_1)
161 printk(KERN_INFO "[nvdebug] Data written: %#010x\n", SAVE_1);
162 uint32_t errcode = readl(g->regs + 0x0000908C); // NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE
163 if (errcode)
164 printk(KERN_INFO "[nvdebug] FECS Error Code: %#010x\n", errcode);
165 // badf5040 is a "client error" (0) of "no such address" (40)
166 // See linux-nvgpu/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c
167 // for how to decode.
168 }
169 if (bus_intr & 1 << 3)
170 printk(KERN_INFO "[nvdebug] - Interrupt PRI_TIMEOUT.\n");
171 if (bus_intr & 1 << 4)
172 printk(KERN_INFO "[nvdebug] - Interrupt FB_REQ_TIMEOUT.\n");
173 if (bus_intr & 1 << 5)
174 printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_TIMEOUT.\n");
175 if (bus_intr & 1 << 6)
176 printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_EXTRA.\n");
177 if (bus_intr & 1 << 7)
178 printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_TIMEOUT.\n");
179 if (bus_intr & 1 << 8)
180 printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_EXTRA.\n");
181 if (bus_intr & 1 << 26)
182 printk(KERN_INFO "[nvdebug] - Interrupt SW.\n");
183 if (bus_intr & 1 << 27)
184 printk(KERN_INFO "[nvdebug] - Interrupt POSTED_DEADLOCK_TIMEOUT.\n");
185 if (bus_intr & 1 << 28)
186 printk(KERN_INFO "[nvdebug] - Interrupt MPMU.\n");
187 if (bus_intr & 1 << 31)
188 printk(KERN_INFO "[nvdebug] - Interrupt ACCESS_TIMEOUT.\n");
189 }
190 if (158 / 32 == i && 1 << (158 % 32) & leaf) // PRIV_RING ga100
191 printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n");
192 if (192 / 32 == i && 1 << (192 % 32) & leaf) // LEGACY_ENGINE_STALL ga100
193 printk(KERN_INFO "[nvdebug] - Interrupt on LEGACY_ENGINE_STALL.\n");
194 if (160 / 32 == i && 1 << (160 % 32) & leaf) { // (likely) rl0 ga100
195 printk(KERN_INFO "[nvdebug] - Interrupt on RUNLIST0.\n");
196 uint32_t off;
197 get_runlist_ram(g, 0, &off);
198 uint32_t rl_intr = nvdebug_readl(g, off+0x100);
199 printk(KERN_INFO "[nvdebug] - RUNLIST_INTR_0: %#x\n", rl_intr);
200	if (rl_intr & 1 << 12) { // BAD_TSG
201 printk(KERN_INFO "[nvdebug] - BAD_TSG: %#x\n", nvdebug_readl(g, off+0x174));
202 }
203 }
204 // Also getting 160, 161, and 162
205
206 //uint32_t off;
207 //get_runlist_ram(g, 12, &off);
208 //printk(KERN_INFO "[nvdebug] - rl10 vector id 0 is %x\n", nvdebug_readl(g, off+0x160)); // NV_RUNLIST_INTR_VECTORID(0)
209 // 160 is rl0 (C/G, LCE0, LCE1) vector id 0
210 // 168 is rl11 (LCE3) vector id 0
211 // 169 is rl12 (LCE4) vector id 0
212 // 171 is rl1 (SEC) vector id 0
213 // 176 is rl10 (LCE2) vector id 0
214 // 224 is rl0 vector id 1
215 // Only some interrupt vectors are hardcoded
216 }
217	// each subtree has two leaves? Each bit at the top corresponds to a subtree?
218 // So, if bit 0 is set, that means subtree 0 (concept) and leaves 0 and 1
219 // So, if bit 1 is set, that means subtree 1 (concept) and leaves 2 and 3
220 // the #define'd interrupt vectors all seem to fall in the lower leaf of subtree 2,
221	// except for INTR_HUB_ACCESS_CNTR_INTR_VECTOR, which is in the lower leaf of subtree 1
222 if (g->chip_id >= NV_CHIP_ID_AMPERE)
223 return IRQ_NONE;
224 }
225 uint32_t intr_mask = nvdebug_readl(g, 0x0100); // NV_PMC_INTR
226 printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with MC_INTR %#010x @ %llu.\n", irq_num, intr_mask, time);
227 // IDs likely changed Ampere+
228 //if (g->chip_id >= NV_CHIP_ID_AMPERE) {
229 // CIC is central interrupt controller
230 // the u32 passed around nvgpu cic functions is one of the
231 // enable is nvgpu_cic_mon_intr_stall_unit_config(unit)
232 // - Calls intr_stall_unit_config(unit)
233 // - for ga, calls unit = ga10b_intr_map_mc_stall_unit_to_intr_unit(unit) (doesn't do much)
234 // - for ga, calls nvgpu_cic_mon_intr_get_unit_info()
235 // - Does *subtree = g->mc.intr_unit_info[unit].subtree;
236 // *subtree_mask = g->mc.intr_unit_info[unit].subtree_mask;
237 // - for ga, calls ga10b_intr_config() w/ subtree info
238 //uint32_t intr_stats = nvdebug_readl(g, 1600
239 //return IRQ_NONE;
240 //}
241 if (intr_mask & 1 << 5)
242 printk(KERN_INFO "[nvdebug] - Interrupt on LCE0.\n");
243 if (intr_mask & 1 << 6)
244 printk(KERN_INFO "[nvdebug] - Interrupt on LCE1.\n");
245 if (intr_mask & 1 << 7)
246 printk(KERN_INFO "[nvdebug] - Interrupt on LCE2.\n");
247 if (intr_mask & 1 << 8) {
248 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO.\n");
249 nvdebug_fifo_intr(g);
250 }
251 if (intr_mask & 1 << 9) {
252 printk(KERN_INFO "[nvdebug] - Interrupt on HUB.\n"); // "replayable_fault_pending" in nvgpu on Pascal, "HUB" on Volta+
253 // on tu104, if vector is one of the below set in new-style interrupt vector, then MMU fault
254 // - info_fault (134)
255 // - nonreplay_fault error (133)
256 // - nonreplay_fault notify (132)
257 // - replay_fault error (131)
258 // - replay_fault notify (64)
259 // (but the above fault vectors are configurable)
260 // if it's ecc_error, then not mmu error
261 // Default fault vectors from open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt
262 // Turing through (at least) Ampere (per nvgpu)
263
264 // on gv100, parse fb_niso_intr_r 0x00100a20U, where bits:
265 // - hub_access_counter notify (0)
266 // - hub_access_counter error (1)
267 // - replay_fault notify (27)
268 // - replay_fault overflow (28)
269 // - nonreplay_fault notify (29)
270 // - nonreplay_fault overflow (30)
271 // - other_fault notify (31)
272 // Volta through Turing (per nvgpu)
273
274 // On Pascal, it looks like it's a property of fifo_intr_0???
275 if (g->chip_id < NV_CHIP_ID_VOLTA)
276 nvdebug_fifo_intr(g);
277 }
278 if (intr_mask & 1 << 10)
279 printk(KERN_INFO "[nvdebug] - Interrupt on LCE3.\n");
280 if (intr_mask & 1 << 11)
281 printk(KERN_INFO "[nvdebug] - Interrupt on LCE4.\n");
282 if (intr_mask & 1 << 12) {
283 printk(KERN_INFO "[nvdebug] - Interrupt on Graphics/Compute.\n");
284 // Kepler through (at least) Ampere
285 // From open-gpu-doc/manuals/volta/gv100/dev_graphics.ref.txt
286 uint32_t graph_intr_mask = nvdebug_readl(g, 0x400100); // NV_PGRAPH_INTR
287 if (graph_intr_mask & 1 << 0)
288 printk(KERN_INFO "[nvdebug] - Interrupt NOTIFY.\n");
289 if (graph_intr_mask & 1 << 1)
290 printk(KERN_INFO "[nvdebug] - Interrupt SEMAPHORE.\n");
291 if (graph_intr_mask & 1 << 4)
292 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_METHOD.\n");
293 if (graph_intr_mask & 1 << 5)
294 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_CLASS.\n");
295 if (graph_intr_mask & 1 << 6)
296 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_NOTIFY.\n");
297 if (graph_intr_mask & 1 << 7)
298 printk(KERN_INFO "[nvdebug] - Interrupt DEBUG_METHOD.\n");
299 if (graph_intr_mask & 1 << 8)
300 printk(KERN_INFO "[nvdebug] - Interrupt FIRMWARE_METHOD.\n");
301 if (graph_intr_mask & 1 << 16)
302 printk(KERN_INFO "[nvdebug] - Interrupt BUFFER_NOTIFY.\n");
303 if (graph_intr_mask & 1 << 19)
304 printk(KERN_INFO "[nvdebug] - Interrupt FECS_ERROR.\n");
305 if (graph_intr_mask & 1 << 20)
306 printk(KERN_INFO "[nvdebug] - Interrupt CLASS_ERROR.\n");
307 if (graph_intr_mask & 1 << 21)
308 printk(KERN_INFO "[nvdebug] - Interrupt EXCEPTION.\n");
309 }
310 if (intr_mask & 1 << 13)
311 printk(KERN_INFO "[nvdebug] - Interrupt on PFB.\n");
312 if (intr_mask & 1 << 15)
313 printk(KERN_INFO "[nvdebug] - Interrupt on SEC.\n");
314 if (intr_mask & 1 << 16)
315 printk(KERN_INFO "[nvdebug] - Interrupt on NVENC0.\n");
316 if (intr_mask & 1 << 17)
317 printk(KERN_INFO "[nvdebug] - Interrupt on NVDEC0.\n");
318 if (intr_mask & 1 << 18)
319 printk(KERN_INFO "[nvdebug] - Interrupt on THERMAL.\n");
320 if (intr_mask & 1 << 19)
321 printk(KERN_INFO "[nvdebug] - Interrupt on HDACODEC.\n");
322 if (intr_mask & 1 << 20)
323 printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n");
324 if (intr_mask & 1 << 21)
325 printk(KERN_INFO "[nvdebug] - Interrupt on PMGR.\n");
326 if (intr_mask & 1 << 22)
327 printk(KERN_INFO "[nvdebug] - Interrupt on IOCTRL.\n");
328 if (intr_mask & 1 << 23)
329 printk(KERN_INFO "[nvdebug] - Interrupt on DFD.\n");
330 if (intr_mask & 1 << 24)
331 printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n");
332 if (intr_mask & 1 << 25)
333 printk(KERN_INFO "[nvdebug] - Interrupt on LTC.\n");
334 if (intr_mask & 1 << 26)
335 printk(KERN_INFO "[nvdebug] - Interrupt on PDISP.\n");
336 if (intr_mask & 1 << 27)
337 printk(KERN_INFO "[nvdebug] - Interrupt on GSP.\n");
338 if (intr_mask & 1 << 28)
339 printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n");
340 if (intr_mask & 1 << 29)
341 printk(KERN_INFO "[nvdebug] - Interrupt on XVE.\n");
342 if (intr_mask & 1 << 30)
343 printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n");
344	if (intr_mask & 1 << 31)
345 printk(KERN_INFO "[nvdebug] - Interrupt on SOFTWARE.\n");
346
77 return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. 347 return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
78} 348}
79#endif // INTERRUPT_DEBUG 349#endif // INTERRUPT_DEBUG
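
The IRQ tap above repeatedly uses the same leaf-indexing arithmetic: interrupt vector V lives in CPU_INTR_LEAF register V / 32, at bit V % 32. A minimal userspace sketch of that arithmetic, using the vector numbers quoted in the comments (136/137 for PFIFO0/PFIFO1 on ga100) and dummy leaf values instead of real register reads:

    #include <stdint.h>
    #include <stdio.h>

    /* Return nonzero if interrupt vector `v` is pending in the 8-entry
     * CPU_INTR_LEAF array: vector v lives in leaf v / 32, bit v % 32. */
    static int vector_pending(const uint32_t leaf[8], unsigned v)
    {
        return (leaf[v / 32] >> (v % 32)) & 1;
    }

    int main(void)
    {
        uint32_t leaf[8] = {0};

        /* Pretend vector 136 (PFIFO0 on ga100, per the comments above) fired:
         * 136 / 32 == 4 and 136 % 32 == 8, so set bit 8 of leaf 4. */
        leaf[136 / 32] |= 1u << (136 % 32);

        printf("PFIFO0 (136) pending? %d\n", vector_pending(leaf, 136)); /* 1 */
        printf("PFIFO1 (137) pending? %d\n", vector_pending(leaf, 137)); /* 0 */
        return 0;
    }
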
@@ -135,6 +405,7 @@ int probe_and_cache_devices(void) {
135 g_nvdebug_state[i].pcid = NULL; 405 g_nvdebug_state[i].pcid = NULL;
136 g_nvdebug_state[i].platd = platd; 406 g_nvdebug_state[i].platd = platd;
137 g_nvdebug_state[i].dev = dev; 407 g_nvdebug_state[i].dev = dev;
408 INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs);
138 // Don't check Chip ID until everything else is initalized 409 // Don't check Chip ID until everything else is initalized
139 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); 410 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
140 if (ids.raw == -1) { 411 if (ids.raw == -1) {
@@ -152,6 +423,11 @@ int probe_and_cache_devices(void) {
152 mc_boot_0_t ids; 423 mc_boot_0_t ids;
153 g_nvdebug_state[i].g = NULL; 424 g_nvdebug_state[i].g = NULL;
154 // Map BAR0 (GPU control registers) 425 // Map BAR0 (GPU control registers)
426 // XXX: Don't use pci_iomap. This adds support for I/O registers, but we do
427 // not use the required ioread/write functions for those regions. We
428	// should use pci_ioremap_bar, which is explicitly for MMIO regions.
429 // pci_ioremap_bar -> ioremap_nocache (all platforms)
430 // pci_iomap -> ioremap_nocache (on x86)
155 g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); 431 g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
156 if (!g_nvdebug_state[i].regs) { 432 if (!g_nvdebug_state[i].regs) {
157 pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); 433 pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
@@ -163,9 +439,14 @@ int probe_and_cache_devices(void) {
163 // (vesafb may map the top half for display) 439 // (vesafb may map the top half for display)
164 if (!g_nvdebug_state[i].bar3) 440 if (!g_nvdebug_state[i].bar3)
165 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); 441 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
442	// Observed on H100: BAR1 was moved to BAR2, BAR2 to BAR3, and BAR3
443	// was moved to BAR4.
444 if (!g_nvdebug_state[i].bar3)
445 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 4, 0);
166 g_nvdebug_state[i].pcid = pcid; 446 g_nvdebug_state[i].pcid = pcid;
167 g_nvdebug_state[i].platd = NULL; 447 g_nvdebug_state[i].platd = NULL;
168 g_nvdebug_state[i].dev = &pcid->dev; 448 g_nvdebug_state[i].dev = &pcid->dev;
449 INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs);
169 // Don't check Chip ID until everything else is initalized 450 // Don't check Chip ID until everything else is initalized
170 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); 451 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
171 if (ids.raw == -1) { 452 if (ids.raw == -1) {
@@ -175,9 +456,17 @@ int probe_and_cache_devices(void) {
175 g_nvdebug_state[i].chip_id = ids.chip_id; 456 g_nvdebug_state[i].chip_id = ids.chip_id;
176 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", 457 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
177 ids.chip_id, ARCH2NAME(ids.architecture)); 458 ids.chip_id, ARCH2NAME(ids.architecture));
178#if INTERRUPT_DEBUG 459#ifdef INTERRUPT_DEBUG
179 if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) { 460 // For this to work, you must also add IRQF_SHARED to the flags
180 printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n"); 461 // argument of the request_threaded_irq() call in the nvidia driver
462 // (file /usr/src/nvidia.../nvidia/nv.c and nv-msi.c with dkms)
463 // Then run:
464 // sudo dkms remove nvidia-srv/VER -k $(uname -r)
465 // sudo dkms install nvidia-srv/VER -k $(uname -r) --force
466	// where VER is the version of the nvidia module (e.g. 535.216.03)
467 int err;
468 if ((err = request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", &g_nvdebug_state[i]))) {
469 printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap, error %d\n", err);
181 } 470 }
182#endif // INTERRUPT_DEBUG 471#endif // INTERRUPT_DEBUG
183 i++; 472 i++;
@@ -335,6 +624,40 @@ int __init nvdebug_init(void) {
335 "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops), 624 "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops),
336 (void*)last_runlist)) 625 (void*)last_runlist))
337 goto out_nomem; 626 goto out_nomem;
627 // Create file `/proc/gpu#/runlist#/wfi_preempt_channel`, world writable
628 // On Turing and older, `/proc/gpu#/wfi_preempt_channel`
629 if (!proc_create_data(
630 "wfi_preempt_channel", 0222, chram_scope, compat_ops(&wfi_preempt_channel_file_ops),
631 (void*)last_runlist))
632 goto out_nomem;
633 // Create file `/proc/gpu#/runlist#/cta_preempt_channel`, world writable
634 // On Turing and older, `/proc/gpu#/cta_preempt_channel`
635 if (!proc_create_data(
636 "cta_preempt_channel", 0222, chram_scope, compat_ops(&cta_preempt_channel_file_ops),
637 (void*)last_runlist))
638 goto out_nomem;
639 // Compute-instruction-level (CIL) preemption is only available on Pascal+
640 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
641 // Create file `/proc/gpu#/runlist#/cil_preempt_channel`, world writable
642 // On Turing and older, `/proc/gpu#/cil_preempt_channel`
643 if (!proc_create_data(
644 "cil_preempt_channel", 0222, chram_scope, compat_ops(&cil_preempt_channel_file_ops),
645 (void*)last_runlist))
646 goto out_nomem;
647 }
648 // Create files which enable on-GPU scheduling (Pascal+)
649 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
650 // Create file `/proc/gpu#/map_mem_chid`, root writable
651 if (!proc_create_data(
652 "map_mem_chid", 0200, chram_scope, compat_ops(&map_mem_chid_file_ops),
653 (void*)last_runlist))
654 goto out_nomem;
655 // Create file `/proc/gpu#/map_mem_ctxid`, root writable
656 if (!proc_create_data(
657 "map_mem_ctxid", 0222, rl_dir, compat_ops(&map_mem_ctxid_file_ops),
658 (void*)last_runlist))
659 goto out_nomem;
660 }
338 } 661 }
339 // Create file `/proc/gpu#/runlist#/runlist`, world readable 662 // Create file `/proc/gpu#/runlist#/runlist`, world readable
340 if (!proc_create_data( 663 if (!proc_create_data(
@@ -346,16 +669,26 @@ int __init nvdebug_init(void) {
346 "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops), 669 "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops),
347 (void*)last_runlist)) 670 (void*)last_runlist))
348 goto out_nomem; 671 goto out_nomem;
672 /* On the TU104, the context scheduler (contained in the Host, aka
673	 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
674 * containing re-enabled channels. Resubmitting the runlist
675 * configuration appears to remediate this condition, and so this API
676 * is exposed to help reset GPU scheduling as necessary.
677 */
678	// Create file `/proc/gpu#/runlist#/resubmit_runlist`, world writable
679 if (!proc_create_data(
680 "resubmit_runlist", 0222, rl_dir, compat_ops(&resubmit_runlist_file_ops),
681 (void*)device_id))
682 goto out_nomem;
349 } while (last_runlist-- > 0); 683 } while (last_runlist-- > 0);
350 /* On the TU104, the context scheduler (contained in the Host, aka 684 // Create file `/proc/gpu#/preempt_runlist`, world writable
351	 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
352 * containing re-enabled channels. Resubmitting the runlist
353 * configuration appears to remediate this condition, and so this API
354 * is exposed to help reset GPU scheduling as necessary.
355 */
356 // Create file `/proc/gpu#/resubmit_runlist`, world writable
357 if (!proc_create_data( 685 if (!proc_create_data(
358 "resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops), 686 "preempt_runlist", 0222, dir, compat_ops(&preempt_runlist_file_ops),
687 (void*)device_id))
688 goto out_nomem;
689 // Create file `/proc/gpu#/ack_bad_tsg`, world writable
690 if (!proc_create_data(
691 "ack_bad_tsg", 0222, dir, compat_ops(&ack_bad_tsg_file_ops),
359 (void*)device_id)) 692 (void*)device_id))
360 goto out_nomem; 693 goto out_nomem;
361 // Create file `/proc/gpu#/device_info`, world readable 694 // Create file `/proc/gpu#/device_info`, world readable
@@ -394,6 +727,68 @@ int __init nvdebug_init(void) {
394 (void*)NV_FUSE_GPC_GM107)) 727 (void*)NV_FUSE_GPC_GM107))
395 goto out_nomem; 728 goto out_nomem;
396 } 729 }
730 // Create file `/proc/gpu#/CWD_SM_ID#`, world readable (Maxwell+)
731 // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable (Maxwell+)
732 // - 6 entries on Maxwell (nvgpu)
733 // - 16 entries on Pascal through Ampere (at least) (nvgpu, open-gpu-doc)
734 // - 24 entries on Hopper through Ada (at least) (XXXX)
735 // XXX: Only working while a context is active
736 // XXX: Needed for libsmctrl2; hacky
737 // Tested on GP104, TU102, GV100, AD102
738 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_HOPPER) {
739 char file_name[21];
740 long i;
741 for (i = 0; i < 24; i++) {
742 snprintf(file_name, 20, "CWD_SM_ID%ld", i);
743 if (!proc_create_data(
744 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
745 (void*)(0x00405100+4*i))) // XXX: From XXXX
746 goto out_nomem;
747 // 18 entries on Ada (RTX 6000 Ada)
748 // Returns 0xbadf1201 if GPU not active
749 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
750 if (!proc_create_data(
751 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
752 // Nothing between this location and CWD_SM_ID
753 (void*)((0x00405000)+4*i))) // Found via reverse search from CWD_SM_ID location on Ada
754 goto out_nomem;
755 // Nothing in the following 28 words (before 0x00405220)
756 }
757 } else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
758 char file_name[21];
759 long i;
760 union reg_range num_gpc_range;
761 for (i = 0; i < 16; i++) {
762 snprintf(file_name, 20, "CWD_SM_ID%ld", i);
763 if (!proc_create_data(
764 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
765 (void*)(0x00405ba0+4*i))) // NV_PGRAPH_PRI_CWD_SM_ID(i)
766 goto out_nomem;
767 // ? entries on Maxwell
768 // 8 entries on Pascal (test)
769 // 16 entries on Volta through Ampere (open-gpu-doc)
770	// Returns 0 if GPU not active
771 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
772 if (!proc_create_data(
773 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
774 (void*)(0x00405b60+4*i))) // NV_PGRAPH_PRI_CWD_GPC_TPC_ID(i)
775 goto out_nomem;
776 }
777 num_gpc_range.offset = 0x00405b00; // NV_PGRAPH_PRI_CWD_FS
778 // Lower eight bits of register are _NUM_GPCS
779 num_gpc_range.start_bit = 0;
780 num_gpc_range.stop_bit = 8;
781 if (!proc_create_data(
782 "CWD_FS_NUM_GPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
783 (void*)(num_gpc_range.raw)))
784 goto out_nomem;
785 num_gpc_range.start_bit = 8;
786 num_gpc_range.stop_bit = 16;
787 if (!proc_create_data(
788 "CWD_FS_NUM_TPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
789 (void*)(num_gpc_range.raw)))
790 goto out_nomem;
791 }
397 // Create file `/proc/gpu#/local_memory`, world readable (Pascal+) 792 // Create file `/proc/gpu#/local_memory`, world readable (Pascal+)
398 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { 793 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
399 if (!proc_create_data( 794 if (!proc_create_data(
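
The reg_range files above (CWD_FS_NUM_GPCS and CWD_FS_NUM_TPCS) expose a bit field of a register; from the 0/8 and 8/16 start/stop pairs used for _NUM_GPCS and _NUM_TPCS, stop_bit appears to be exclusive. A standalone sketch of that presumed extraction (the register value below is hypothetical, and this mirrors the apparent semantics rather than the actual nvdebug_read_reg_range_file_ops implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* Extract bits [start, stop) of a 32-bit register value. */
    static uint32_t reg_range(uint32_t val, unsigned start, unsigned stop)
    {
        unsigned width = stop - start;
        uint32_t mask = width >= 32 ? ~0u : (1u << width) - 1;
        return (val >> start) & mask;
    }

    int main(void)
    {
        uint32_t cwd_fs = 0x0000080e; /* hypothetical NV_PGRAPH_PRI_CWD_FS value */

        printf("NUM_GPCS = %u\n", reg_range(cwd_fs, 0, 8));  /* bits 0-7  -> 14 */
        printf("NUM_TPCS = %u\n", reg_range(cwd_fs, 8, 16)); /* bits 8-15 -> 8 */
        return 0;
    }
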
@@ -414,6 +809,50 @@ int __init nvdebug_init(void) {
414 (void*)NV_CE_PCE_MAP)) 809 (void*)NV_CE_PCE_MAP))
415 goto out_nomem; 810 goto out_nomem;
416 } 811 }
812 // Create files exposing subcontext partitioning (Volta+)
813 // TODO: Make this not a hack with undocumented magic numbers
814 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
815 char file_name[21];
816 long i;
817 // Create file `/proc/gpu#/partition_ctl`, world readable
818 if (!proc_create_data(
819 "partition_ctl", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
820 (void*)0x00405b2c))
821 goto out_nomem;
822 // Create file `/proc/gpu#/partition_data`, world readable
823 if (!proc_create_data(
824 "partition_data", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
825 (void*)0x00405b30))
826 goto out_nomem;
827 // Create file `/proc/gpu#/partition_data#`, world readable
828 for (i = 0; i < 64; i++) {
829 snprintf(file_name, 20, "partition_data%ld", i);
830 if (!proc_create_data(
831 file_name, 0444, dir, compat_ops(&nvdebug_read_part_file_ops),
832 (void*)i))
833 goto out_nomem;
834 }
835 // For debugging what MPS is changing
836 // Create file `/proc/gpu#/CWD_CG0`, world readable
837 if (!proc_create_data(
838 "CWD_CG0", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
839 (void*)0x00405bf0))
840 goto out_nomem;
841 // Create file `/proc/gpu#/CWD_CG1`, world readable
842 if (!proc_create_data(
843 "CWD_CG1", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
844 (void*)0x00405bf4))
845 goto out_nomem;
846 // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable
847 // This does not appear to work on Hopper. Works on Ampere.
848 /*for (i = 0; i < 16; i++) {
849 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
850 if (!proc_create_data(
851 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
852 (void*)(0x00405b60+4*i)))
853 goto out_nomem;
854 }*/
855 }
417 } 856 }
418 // (See Makefile if you want to know the origin of GIT_HASH.) 857 // (See Makefile if you want to know the origin of GIT_HASH.)
419 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n"); 858 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
@@ -439,16 +878,19 @@ static void __exit nvdebug_exit(void) {
439 char device_id[7]; 878 char device_id[7];
440 snprintf(device_id, 7, "gpu%d", g_nvdebug_devices); 879 snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
441 remove_proc_subtree(device_id, NULL); 880 remove_proc_subtree(device_id, NULL);
881 // Force-free associated allocations
442 g = &g_nvdebug_state[g_nvdebug_devices]; 882 g = &g_nvdebug_state[g_nvdebug_devices];
883 gc_page_directory(g, true);
443 // Free BAR mappings for PCIe devices 884 // Free BAR mappings for PCIe devices
444 if (g && g->pcid) { 885 if (g && g->pcid) {
886#ifdef INTERRUPT_DEBUG
887 // IRQ handler uses g->regs, so free IRQ first
888 free_irq(g->pcid->irq, g);
889#endif // INTERRUPT_DEBUG
445 if (g->regs) 890 if (g->regs)
446 pci_iounmap(g->pcid, g->regs); 891 pci_iounmap(g->pcid, g->regs);
447 if (g->bar2) 892 if (g->bar2)
448 pci_iounmap(g->pcid, g->bar2); 893 pci_iounmap(g->pcid, g->bar2);
449#if INTERRUPT_DEBUG
450 free_irq(g->pcid->irq, g->pcid);
451#endif // INTERRUPT_DEBUG
452 } else { 894 } else {
453 if (g->regs) 895 if (g->regs)
454 iounmap(g->regs); 896 iounmap(g->regs);
diff --git a/nvdebug_linux.h b/nvdebug_linux.h
index 2ad4ce1..b232720 100644
--- a/nvdebug_linux.h
+++ b/nvdebug_linux.h
@@ -20,6 +20,11 @@ static inline struct gk20a *get_gk20a(struct device *dev) {
20#define pde_data PDE_DATA 20#define pde_data PDE_DATA
21#endif 21#endif
22 22
23// iommu_map() requires an extra parameter on Linux 6.3+
24#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,3,0)
25#define iommu_map(a, b, c, d, e) iommu_map(a, b, c, d, e, GFP_KERNEL)
26#endif
27
23// We use the data field of the proc_dir_entry ("PDE" in this function) to store 28// We use the data field of the proc_dir_entry ("PDE" in this function) to store
24// our index into the g_nvdebug_state array 29// our index into the g_nvdebug_state array
25static inline int seq2gpuidx(struct seq_file *s) { 30static inline int seq2gpuidx(struct seq_file *s) {
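
The iommu_map() shim above leans on a C preprocessor rule: a function-like macro is not re-expanded inside its own replacement list, so the wrapped name can forward to the real function while appending the new GFP argument. A small userspace illustration of the same trick, using a hypothetical do_map() function that gained a fourth parameter:

    #include <stdio.h>

    /* "New" signature: a flags parameter was appended in a later version. */
    static int do_map(unsigned long iova, unsigned long paddr, unsigned long size,
                      unsigned flags)
    {
        printf("map %#lx -> %#lx (%lu bytes, flags %#x)\n", iova, paddr, size, flags);
        return 0;
    }

    /* Callers written against the old three-argument signature keep working:
     * the macro name is not re-expanded inside its own replacement list, so
     * this forwards to the real function with a default for the new argument. */
    #define do_map(a, b, c) do_map(a, b, c, 0)

    int main(void)
    {
        return do_map(0x1000UL, 0x2000UL, 4096UL);
    }
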
diff --git a/runlist.c b/runlist.c
index 7bb2ee4..3076d27 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,19 +1,13 @@
1/* Copyright 2024 Joshua Bakita 1/* Copyright 2024 Joshua Bakita
2 * Helpers for dealing with the runlist and other Host (PFIFO) registers 2 * Helpers for dealing with the runlist and other Host (PFIFO) registers
3 */ 3 */
4#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
4#include <linux/printk.h> // For printk() 5#include <linux/printk.h> // For printk()
5#include <asm/errno.h> // For error defines 6#include <asm/errno.h> // For error defines
6#include <asm/io.h> // For phys_to_virt() 7#include <asm/io.h> // For phys_to_virt()
7 8
8#include "nvdebug.h" 9#include "nvdebug.h"
9 10
10// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
11// in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN
12// **must** not be moved during runlist traversal.
13// The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this must
14// be enabled to print the runlist on the TX2.
15//#define FALLBACK_TO_PRAMIN
16
17/* Get RunList RAM (RLRAM) offset for a runlist from the device topology 11/* Get RunList RAM (RLRAM) offset for a runlist from the device topology
18 @param rl_id Which runlist to obtain [numbered in order of appearance in 12 @param rl_id Which runlist to obtain [numbered in order of appearance in
19 the device topology (PTOP) registers] 13 the device topology (PTOP) registers]
@@ -116,6 +110,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
116 runlist_len = submit.len; 110 runlist_len = submit.len;
117 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", 111 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n",
118 rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); 112 rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
113 printk(KERN_INFO "[nvdebug] Runlist offset is %d\n", submit.offset);
119 rl_iter->runlist_pri_base = runlist_pri_base; 114 rl_iter->runlist_pri_base = runlist_pri_base;
120 } 115 }
121 // Return early on an empty runlist 116 // Return early on an empty runlist
@@ -130,6 +125,12 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
130 if ((err = get_bar2_pdb(g, &pd_config)) < 0) 125 if ((err = get_bar2_pdb(g, &pd_config)) < 0)
131 goto attempt_pramin_access; 126 goto attempt_pramin_access;
132 127
128 // XXX: PD version detection not working on Hopper [is_ver2 errantly (?) unset]
129 if (g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) {
130 printk(KERN_WARNING "[nvdebug] V3 page tables do not currently work on Hopper! Mystery config: %llx\n", pd_config.raw);
131 err = -EOPNOTSUPP;
132 goto attempt_pramin_access;
133 }
133 if (pd_config.is_ver2) 134 if (pd_config.is_ver2)
134 runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM); 135 runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM);
135 else 136 else
@@ -233,7 +234,7 @@ int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id) {
233} 234}
234 235
235// Read and write runlist configuration, triggering a resubmit 236// Read and write runlist configuration, triggering a resubmit
236int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { 237int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off) {
237 // Necessary registers do not exist pre-Fermi 238 // Necessary registers do not exist pre-Fermi
238 if (g->chip_id < NV_CHIP_ID_FERMI) 239 if (g->chip_id < NV_CHIP_ID_FERMI)
239 return -EOPNOTSUPP; 240 return -EOPNOTSUPP;
@@ -252,6 +253,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
252 return -EINVAL; 253 return -EINVAL;
253 if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1) 254 if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1)
254 return -EIO; 255 return -EIO;
256 preempt_runlist(g, rl_id);
257 if (off != -1)
258 submit.offset = off;
255 nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw); 259 nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw);
256 } else { 260 } else {
257 int err; 261 int err;
@@ -261,6 +265,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
261 return err; 265 return err;
262 if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) 266 if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1)
263 return -EIO; 267 return -EIO;
268 preempt_runlist(g, rl_id);
269 if (off != -1)
270 submit.offset = off;
264 // On Ampere, this does not appear to trigger a preempt of the 271 // On Ampere, this does not appear to trigger a preempt of the
265 // currently-running channel (even if the currently running channel 272 // currently-running channel (even if the currently running channel
266 // becomes disabled), but will cause newly re-enabled channels 273 // becomes disabled), but will cause newly re-enabled channels
@@ -270,3 +277,255 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
270 } 277 }
271 return 0; 278 return 0;
272} 279}
280
281/* Get a CPU-accessible pointer to an arbitrary-address-space instance block
282 @param instance_addr Address of instance block
283	 @param instance_target Aperture/target of instance block address
284 @return A dereferencable KVA, NULL if not found, or an ERR_PTR-wrapped error
285
286 Note: The returned address will be a BAR2 or physical address, mapped into
287 kernel space, /not/ a PRAMIN-derived address. Thus, the returned
288	 address will have an indefinite lifetime, and will be unaffected by use
289 of PRAMIN elsewhere (such as to read the CTXSW block).
290*/
291instance_ctrl_t *instance_deref(struct nvdebug_state *g, uint64_t instance_addr,
292 enum INST_TARGET instance_target) {
293 if (!instance_addr || instance_target == TARGET_INVALID)
294 return ERR_PTR(-EINVAL);
295 if (instance_target == TARGET_VID_MEM) {
296 int err;
297 uint64_t inst_bar_vaddr;
298 page_dir_config_t pd_config;
299 // Only access VID_MEM via BAR2; do not fall back to PRAMIN
300 if (!g->bar2)
301 return NULL;
302 // Find page tables which define how BAR2/3 offsets are translated to
303 // physical VID/SYS_MEM addresses.
304 if ((err = get_bar2_pdb(g, &pd_config)) < 0) {
305 printk(KERN_ERR "[nvdebug] Error: Unable to access page directory "
306 "configuration for BAR2/3. Error %d.\n", err);
307 return ERR_PTR(err);
308 }
309 // Search the BAR2/3 page tables for the offset at which the instance
310 // block is mapped (reverse translation).
311 if (pd_config.is_ver2)
312 inst_bar_vaddr = search_page_directory(g, pd_config, instance_addr, instance_target);
313 else
314 inst_bar_vaddr = search_v1_page_directory(g, pd_config, instance_addr, instance_target);
315 if (!inst_bar_vaddr) {
316 printk(KERN_WARNING "[nvdebug] Warning: Instance block %#018llx "
317 "(%s) appears unmapped in BAR2/3.\n", instance_addr,
318 target_to_text(instance_target));
319 return NULL;
320 }
321 return g->bar2 + inst_bar_vaddr;
322 } else {
323 struct iommu_domain *dom;
324 // SYS_MEM addresses are physical addresses *from the perspective of
325 // the device* ("bus addresses"), and may not necessarially correspond
326 // to physical addresses from the perspective of the CPU. The I/O MMU
327 // is responsible for mapping bus addresses to CPU-relative physical
328 // addresses when there is no direct correspondence. If an I/O MMU is
329 // enabled on this GPU, ask it to translate the bus address to a
330 // CPU-relative physical address.
331 if ((dom = iommu_get_domain_for_dev(g->dev))) {
332 // XXX: As of Aug 2024, this is not tested, so include extra logging
333 printk(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#llx for instance block", instance_addr);
334 if (!(instance_addr = iommu_iova_to_phys(dom, instance_addr))) {
335 printk(KERN_ERR "[nvdebug] Error: I/O MMU failed to translate "
336 "%#018llx (%s) to a CPU-relative physical address.\n",
337 instance_addr, target_to_text(instance_target));
338 return ERR_PTR(-EADDRNOTAVAIL);
339 }
340 printk(KERN_DEBUG " to physical address %#llx.\n", instance_addr);
341 }
342 // Convert from a physical address to a kernel virtual address (KVA)
343 return phys_to_virt(instance_addr);
344 }
345}
346
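
instance_deref() uses the kernel's three-way return convention: a usable pointer, NULL for "not found", or a negative errno encoded into the pointer with ERR_PTR(). A self-contained userspace sketch of that convention (the helpers below are minimal re-implementations for illustration only; the kernel's versions live in <linux/err.h>, and lookup() is a made-up stand-in for instance_deref()):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Minimal userspace re-implementations of the <linux/err.h> helpers. */
    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }
    static int IS_ERR_OR_NULL(const void *p) { return !p || IS_ERR(p); }

    static int table[4] = {10, 20, 30, 40};

    /* Made-up lookup with the same three-way contract as instance_deref(). */
    static int *lookup(int idx)
    {
        if (idx < 0)
            return ERR_PTR(-EINVAL); /* caller error, encoded in the pointer */
        if (idx >= 4)
            return NULL;             /* simply not found */
        return &table[idx];          /* dereferenceable pointer */
    }

    int main(void)
    {
        int *p = lookup(-1);

        if (IS_ERR_OR_NULL(p))
            printf("error or missing: %ld\n", IS_ERR(p) ? PTR_ERR(p) : 0L);
        p = lookup(2);
        if (!IS_ERR_OR_NULL(p))
            printf("value: %d\n", *p);
        return 0;
    }
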
347/* Get a CPU-accessible pointer to the CTXSW block for a channel instance block
348 @param inst Dereferencable pointer to the start of a complete instance block
349 @return A dereferencable KVA, NULL if not found, or an ERR_PTR-wrapped error
350
351 Note: The returned address **will** be a PRAMIN-based address. Any changes to
352 PRAMIN **will** invalidate the returned pointer. `inst` **cannot** be a
353 pointer into the PRAMIN space.
354*/
355context_switch_ctrl_t *get_ctxsw(struct nvdebug_state *g,
356 instance_ctrl_t *inst) {
357 int err;
358 context_switch_ctrl_t *wfi = NULL;
359 uint64_t wfi_virt, wfi_phys, ctxsw_virt, ctxsw_phys;
360 enum INST_TARGET wfi_phys_aperture, ctxsw_phys_aperture;
361
362 // The WFI block contains a pointer to the CTXSW block, which contains the
363 // preemption mode configuration for the context. (As best I can tell, the WFI
364	 // block is subcontext-specific, whereas the CTXSW block is context-wide.)
365 wfi_virt = (uint64_t)inst->engine_wfi_ptr << 12;
366
367 // WFI may not be configured
368 if (!wfi_virt)
369 goto out;
370
371 // Determine the physical location of the WFI block
372 if (inst->engine_wfi_is_virtual) {
373 if (inst->pdb.is_ver2)
374 err = translate_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture);
375 else
376 err = translate_v1_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture);
377 if (err) {
378 printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; WFI block "
379 "pointer %#018llx (virt) cannot be found in process page tables! "
380 "Translation error %d.\n", wfi_virt, -err);
381 return ERR_PTR(-ENOTRECOVERABLE);
382 }
383 } else {
384 wfi_phys = (uint64_t)inst->engine_wfi_ptr << 12;
385 wfi_phys_aperture = inst->engine_wfi_target;
386 }
387
388	 // Get a dereferencable pointer to the WFI block (the WFI and CTXSW blocks
389 // have not been observed as mapped in BAR2/3, so we use the PRAMIN window).
390 // Note: On Jetson boards, we could attempt to avoid PRAMIN since CTXSW is in
391 // SYS_MEM, but this function will always need to use PRAMIN to work
392 // around the WFI and CTXSW blocks not being accessible via BAR2/3 on
393	 //       PCIe GPUs, so always use PRAMIN for simplicity.
394 if ((wfi_phys = addr_to_pramin_mut(g, wfi_phys, wfi_phys_aperture)) == -1)
395 goto out;
396 wfi = g->regs + wfi_phys + NV_PRAMIN;
397
398// XXX
399// return wfi;
400// End XXX
401
402 // While the WFI block uses the same layout as the context switch (CTXSW)
403 // control block, it is mostly unpopulated except for a few pointers on GPUs
404 // after Volta. This appears to be related to subcontexts, where each
405 // subcontext has its own WFI block containing a pointer to the overarching
406 // CTXSW block. Only attempt to find the overarching CTXSW block if at least
407 // one subcontext is enabled.
408 if (inst->subcontext_pdb_valid) {
409 // Subcontexts are Volta+-only. Volta only supports Page Table Ver. 2
410 if (!inst->pdb.is_ver2)
411 return ERR_PTR(-ENOTRECOVERABLE);
412	 // Obtain the address of the CTXSW block in this context
413 ctxsw_virt = wfi->context_buffer_ptr_hi;
414 ctxsw_virt <<= 32;
415 ctxsw_virt |= wfi->context_buffer_ptr_lo;
416 if (!ctxsw_virt) {
417 printk(KERN_WARNING "[nvdebug] Warning: WFI block at %#018llx (phys) "
418 "contains an empty context block pointer.\n", wfi_phys);
419 goto out;
420 }
421
422 // All the pointers in the WFI block are virtual, so convert the CTXSW
423 // block pointer to a physical address. We should always be able to find a
424 // mapping for ctxsw_virt.
425 if ((err = translate_page_directory(g, inst->pdb, ctxsw_virt, &ctxsw_phys, &ctxsw_phys_aperture))) {
426 printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; context "
427 "block pointer %#018llx (virt) cannot be found in process page "
428 "tables! Translation error %d.\n", ctxsw_virt, -err);
429 return ERR_PTR(-ENOTRECOVERABLE);
430 }
431
432	 // Get a dereferencable pointer to the CTXSW block (via PRAMIN; invalidates `wfi`)
433 if ((ctxsw_phys = addr_to_pramin_mut(g, ctxsw_phys, ctxsw_phys_aperture)) == -1)
434 goto out;
435 return g->regs + ctxsw_phys + NV_PRAMIN;
436 } else {
437 // Without subcontexts, the WFI block is the CTXSW block (ex: Pascal)
438 return wfi;
439 }
440out:
441 return NULL;
442}
443
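
get_ctxsw() reconstructs two kinds of pointers: 4 KiB-aligned addresses stored shifted right by 12 bits (engine_wfi_ptr), and full 64-bit virtual addresses split into hi/lo halves (context_buffer_ptr_hi/lo). A tiny standalone sketch of both reconstructions with made-up field values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 4 KiB-aligned pointer stored as address >> 12 (engine_wfi_ptr style). */
        uint64_t engine_wfi_ptr = 0x00123456; /* hypothetical field value */
        uint64_t wfi_virt = engine_wfi_ptr << 12;

        /* 64-bit address split into 32-bit halves (context_buffer_ptr_hi/lo style). */
        uint32_t ptr_hi = 0x00000001, ptr_lo = 0x2345e000; /* hypothetical */
        uint64_t ctxsw_virt = ((uint64_t)ptr_hi << 32) | ptr_lo;

        printf("wfi_virt   = %#llx\n", (unsigned long long)wfi_virt);   /* 0x123456000 */
        printf("ctxsw_virt = %#llx\n", (unsigned long long)ctxsw_virt); /* 0x12345e000 */
        return 0;
    }
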
444/* Change the preemption type to be used on a context's budget expiration
445 @param chan_id As context IDs are hard to obtain and use, this function takes
446 a channel ID and looks up and modifies the associated context.
447 @param rl_id Which channel RAM address space is this channel ID in? (Not
448 used on pre-Ampere GPUs.)
449 @param mode Preemption mode to set.
450 @return 0 or -errno on error
451
452 Note: This change will not apply if the channel's context has running work,
453 or if the GPU is idle and this channel's context was last to run.
454 Please ensure some other task is running before calling this API.
455*/
456int set_channel_preemption_mode(struct nvdebug_state *g, uint32_t chan_id,
457 uint32_t rl_id,
458 enum COMPUTE_PREEMPT_TYPE mode) {
459 uint64_t instance_ptr = 0;
460 enum INST_TARGET instance_target;
461 instance_ctrl_t *inst = NULL;
462 context_switch_ctrl_t *ctxsw = NULL;
463 struct runlist_iter rl_iter;
464 uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
465 // Obtain the instance block
466 if (g->chip_id < NV_CHIP_ID_AMPERE) {
467 // Pre-Ampere, Channel RAM includes instance block pointers
468 channel_ctrl_t chan;
469 if (chan_id > MAX_CHID)
470 return -ERANGE;
471 if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan_id))) == -1)
472 return -EIO;
473 instance_ptr = (uint64_t)chan.inst_ptr << 12;
474 instance_target = chan.inst_target;
475 } else {
476 // Starting with Ampere, instance block pointers are only included in
477 // runlist entries. Something like this could work on Maxwell+, but
478 // access via Channel RAM is more heavily-tested.
479 struct gv100_runlist_chan* chan;
480 int err;
481 loff_t pos = 0;
482 // Based off logic of switch_to_tsg_file_write() in runlist_procfs.c
483 if ((err = get_runlist_iter(g, rl_id, &rl_iter)))
484 return err;
485 while (pos < rl_iter.len && !instance_ptr) {
486 for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
487 if (chan_id == chid(g, chan)) {
488 // Channel entry found in runlist. Extract instance ptr.
489 instance_ptr = (uint64_t)chan->inst_ptr_hi << 32;
490 instance_ptr |= (uint64_t)inst_ptr_lo(g, chan) << 12;
491 instance_target = inst_target(g, chan);
492 break;
493 }
494 }
495 pos += 1 + tsg_length(g, rl_iter.curr_entry);
496 rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
497 }
498 // Context switch timeout configuration register was moved with Ampere+
499 ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
500 }
501 if (!instance_ptr)
502 return -ENOENT;
503 // Obtain an instance block pointer routed via BAR2 or SYS_MEM
504 inst = instance_deref(g, instance_ptr, instance_target);
505 if (IS_ERR_OR_NULL(inst))
506 return PTR_ERR(inst);
507 // Obtain pointer to CTXSW block routed via PRAMIN (the CTXSW block
508 // does not appear to be mapped into BAR2).
509 ctxsw = get_ctxsw(g, inst);
510 if (IS_ERR_OR_NULL(ctxsw))
511 return PTR_ERR(ctxsw);
512 ctxsw->compute_preemption_options = mode;
513 // If switching to a preemption mode that runs blocks or kernels non-
514	 // preemptively (CTA-level and WFI respectively), disable the context switch
515 // timeout. If switching to compute-instruction-level preemption (CILP),
516 // reenable it. Observed to be necessary on (at least) gv11b, tu102, and ga10b
517 // XXX: On ga10b (at least), the timeout configuration is reset on a resume
518 // from suspend, overwriting the change made here. This causes a CTXSW
519 // TIMEOUT interrupt to be triggered if any application tries to run
520 // non-preemptively for longer than the timeout period (3100ms on gv11b
521 // and ga10b).
522 if (g->chip_id >= NV_CHIP_ID_VOLTA) {
523 ctxsw_timeout_t timeout_config;
524 if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
525 return -EIO;
526 printk(KERN_DEBUG "[nvdebug] Previous Ctx. Sw. Timeout Configuration: period %d %s\n", timeout_config.period, timeout_config.enabled ? "enabled" : "disabled");
527 timeout_config.enabled = mode == PREEMPT_CILP;
528 nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
529 }
530 return 0;
531}
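
As wired up in nvdebug_entry.c above, set_channel_preemption_mode() is reached from userspace through the wfi_preempt_channel, cta_preempt_channel, and cil_preempt_channel files, which take a channel ID. A minimal userspace sketch, assuming GPU 0, an Ampere+ layout where the files sit under a directory named runlist0, and channel 4 as the target (per the note above, the change only applies once the channel's context is not the one actively running):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* Hypothetical target: put channel 4's context into CTA-level preemption.
         * On Turing and older the file is /proc/gpu0/cta_preempt_channel instead. */
        const char *path = "/proc/gpu0/runlist0/cta_preempt_channel";
        const char *chan = "4\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror(path);
            return 1;
        }
        if (write(fd, chan, strlen(chan)) < 0)
            perror("write");
        close(fd);
        return 0;
    }
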
diff --git a/runlist_procfs.c b/runlist_procfs.c
index b2159f6..a3a6df3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -1,12 +1,117 @@
1#include <linux/seq_file.h> // For seq_* functions and types 1#include <linux/seq_file.h> // For seq_* functions and types
2#include <linux/version.h> // Macros to detect kernel version 2#include <linux/version.h> // Macros to detect kernel version
3#include <linux/platform_device.h> // For platform_get_resource()
4#include <linux/pci.h> // For pci_resource_start()
5#include <linux/iommu.h> // For iommu_ functions
6#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,10,0)
7#include <linux/dma-map-ops.h> // For get_dma_ops()
8#endif
3 9
4#include "nvdebug_linux.h" 10#include "nvdebug_linux.h"
5 11
6// Uncomment to expand channel status information when printing the runlist 12// We cannot touch PRAMIN (via page table operations or ctxsw access) if we're
13// using it to walk the runlist
14//#ifndef FALLBACK_TO_PRAMIN
15// Uncomment to expand channel status, instance, and context information when
16// printing the runlist
7#define DETAILED_CHANNEL_INFO 17#define DETAILED_CHANNEL_INFO
18//#endif
8 19
9#ifdef DETAILED_CHANNEL_INFO 20#ifdef DETAILED_CHANNEL_INFO
21// Print the channel instance and context switch blocks
22// XXX: THIS IS UNSAFE ON KEPLER!
23// instance_deref() will call into the page table logic, which may move PRAMIN.
24// PRAMIN appears heavily utilized by the driver on Bonham (at least), and
25// moving it causes problems.
26static int runlist_detail_seq_show_inst(struct seq_file *s, struct nvdebug_state *g, char *prefix, uint64_t instance_ptr, enum INST_TARGET instance_target) {
27 instance_ctrl_t *inst = NULL;
28 context_switch_ctrl_t *ctxsw = NULL;
29 int i;
30
31#ifdef FALLBACK_TO_PRAMIN
32 bar0_window_t win;
33 win.raw = nvdebug_readl(g, NV_XAL_EP_BAR0_WINDOW_BASE);
34 inst = g->regs + NV_PRAMIN + addr_to_pramin_mut(g, instance_ptr, instance_target);
35#else
36 if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
37	return PTR_ERR(inst);
38#endif // FALLBACK_TO_PRAMIN
39 // If unable to access instance block, skip
40 if (!inst)
41 return 0;
42
43 // Print the channel instance block
44 // As an ID, use upper 52 bits of the instance address (lower 12 are zero)
45 //seq_printf(s, "%s+- Inst %-13llx-+\n", prefix, instance_ptr >> 12);
46 seq_printf(s, "%s|= Instance Block ====|\n", prefix);
47 seq_printf(s, "%s| Target Engine: %2d|\n", prefix, inst->fc_target);
48 seq_printf(s, "%s| Privileged: %1d|\n", prefix, inst->fc_config_is_priv);
49 seq_printf(s, "%s| Channel VEID: %2d|\n", prefix, inst->fc_chan_info_veid);
50 seq_printf(s, "%s| WFI PTR: |\n", prefix);
51 seq_printf(s, "%s| %#018llx|\n", prefix, (uint64_t)inst->engine_wfi_ptr << 12);
52 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->engine_wfi_target));
53 seq_printf(s, "%s| Virtual address? %d|\n", prefix, inst->engine_wfi_is_virtual);
54 seq_printf(s, "%s| WFI VEID: %2d|\n", prefix, inst->engine_wfi_veid);
55 seq_printf(s, "%s| All PDB PTR: |\n", prefix);
56 seq_printf(s, "%s| %#018llx|\n", prefix, (u64)inst->pdb.page_dir << 12);
57 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->pdb.target));
58 seq_printf(s, "%s| %20s|\n", prefix, inst->pdb.is_volatile ? "volatile" : "non-volatile");
59// seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->pdb.raw);
60 seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(inst->subcontext_pdb_valid));
61 // Print configuration of every enabled subcontext
62 for (i = 0; i < 64; i++) {
63 // Skip subcontexts without their enable bit set
64 if (!(1 & (inst->subcontext_pdb_valid >> i)))
65 continue;
66 seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i, inst->subcontext[i].pasid);
67 seq_printf(s, "%s| SC%02d PDB PTR: |\n", prefix, i);
68 seq_printf(s, "%s| %#018llx|\n", prefix, ((u64)inst->subcontext[i].pdb.page_dir_hi << 32) | ((u64)inst->subcontext[i].pdb.page_dir_lo << 12));
69 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->subcontext[i].pdb.target));
70 seq_printf(s, "%s| %20s|\n", prefix, inst->subcontext[i].pdb.is_volatile ? "volatile" : "non-volatile");
71// seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->subcontext[i].pdb.raw);
72 }
73
74 // XXX: CTXSW is only accessible via PRAMIN. Accessing PRAMIN appears to
75 // either be broken, or race with the driver on Kepler (gk104 tested). So,
76 // do not attempt to touch the CTXSW block on Kepler.
77 // TODO: This check should be moved into addr_to_pramin_mut().
78 if (g->chip_id < NV_CHIP_ID_MAXWELL)
79 return 0;
80 // End XXX
81
82 if (IS_ERR(ctxsw = get_ctxsw(g, inst))) {
83#ifdef FALLBACK_TO_PRAMIN
84 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
85#endif
86 return PTR_ERR(ctxsw);
87 }
88 // If unable to access CTXSW block, skip
89 if (!ctxsw) {
90#ifdef FALLBACK_TO_PRAMIN
91 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
92#endif
93 return 0;
94 }
95 // Access and print the preemption mode and context ID
96 seq_printf(s, "%s|= Context State =====|\n", prefix);
97 seq_printf(s, "%s| Ctx. ID: %#10x|\n", prefix, ctxsw->context_id);
98 // No other CTXSW fields are supported pre-Pascal
99 if (g->chip_id < NV_CHIP_ID_PASCAL)
100 return 0;
101 seq_printf(s, "%s| Gfx. Preemption:%4s|\n", prefix,
102 graphics_preempt_type_to_text(ctxsw->graphics_preemption_options));
103 seq_printf(s, "%s| Cmp. Preemption:%4s|\n", prefix,
104 compute_preempt_type_to_text(ctxsw->compute_preemption_options));
105 seq_printf(s, "%s| #WFI Saves:%9d|\n", prefix, ctxsw->num_wfi_save_operations);
106 seq_printf(s, "%s| #CTA Saves:%9d|\n", prefix, ctxsw->num_cta_save_operations);
107 seq_printf(s, "%s| #GFXP Saves:%8d|\n", prefix, ctxsw->num_gfxp_save_operations);
108 seq_printf(s, "%s| #CILP Saves:%8d|\n", prefix, ctxsw->num_cilp_save_operations);
109#ifdef FALLBACK_TO_PRAMIN
110 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
111#endif
112 return 0;
113}
114
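
The subcontext loop in runlist_detail_seq_show_inst() walks a 64-bit valid mask: hweight64() counts the set bits for the summary line, and the loop then visits only the enabled subcontexts. A standalone sketch of the same pattern with a made-up mask (popcount64() is a userspace stand-in for the kernel's hweight64()):

    #include <stdint.h>
    #include <stdio.h>

    /* Userspace stand-in for the kernel's hweight64() (population count). */
    static unsigned popcount64(uint64_t x)
    {
        unsigned n = 0;

        while (x) {
            x &= x - 1; /* clear the lowest set bit */
            n++;
        }
        return n;
    }

    int main(void)
    {
        uint64_t subcontext_pdb_valid = 0x7; /* hypothetical: SC00-SC02 enabled */
        int i;

        printf("Num subcontexts: %u\n", popcount64(subcontext_pdb_valid));
        for (i = 0; i < 64; i++) {
            /* Skip subcontexts without their enable bit set */
            if (!(1 & (subcontext_pdb_valid >> i)))
                continue;
            printf("SC%02d enabled\n", i);
        }
        return 0;
    }
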
10/* Print channel details using PCCSR (Programmable Channel Control System RAM?) 115/* Print channel details using PCCSR (Programmable Channel Control System RAM?)
11 @param s Pointer to state from seq_file subsystem to pass to seq_printf 116 @param s Pointer to state from seq_file subsystem to pass to seq_printf
12 @param g Pointer to our internal GPU state 117 @param g Pointer to our internal GPU state
@@ -32,16 +137,19 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
32 seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); 137 seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr);
33 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); 138 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target));
34 seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); 139 seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
35 return 0; 140 // Print instance block
141 return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, chan.inst_target);
36} 142}
37 143
38/* `runlist_detail_seq_show_chan()`, but for Ampere+ 144/* `runlist_detail_seq_show_chan()`, but for Ampere+
145 @param instance_ptr Address for the channel instance block
146 @param instance_target Aperture of `instance_ptr`
39 @param runlist_pri_base Base of the RLRAM region for this runlist 147 @param runlist_pri_base Base of the RLRAM region for this runlist
40 148
41 `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on 149 `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on
42 Ampere+, and its location is configured in Runlist RAM. 150 Ampere+, and its location is configured in Runlist RAM.
43*/ 151*/
44static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { 152static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base, uint64_t instance_ptr, enum INST_TARGET instance_target) {
45 runlist_channel_config_t channel_config; 153 runlist_channel_config_t channel_config;
46 channel_ctrl_ga100_t chan; 154 channel_ctrl_ga100_t chan;
47 155
@@ -63,7 +171,7 @@ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug
63 seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); 171 seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy);
64 seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); 172 seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy);
65 seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); 173 seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail);
66 return 0; 174 return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, instance_target);
67} 175}
68#endif 176#endif
69 177
@@ -173,7 +281,7 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
173 if (g->chip_id < NV_CHIP_ID_AMPERE) 281 if (g->chip_id < NV_CHIP_ID_AMPERE)
174 runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); 282 runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
175 else 283 else
176 runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); 284 runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base, instance_ptr, inst_target(g, entry));
177#endif 285#endif
178 seq_printf(s, "%s+---------------------+\n", indt); 286 seq_printf(s, "%s+---------------------+\n", indt);
179 } 287 }
@@ -232,15 +340,17 @@ struct file_operations preempt_tsg_file_ops = {
232 340
233ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, 341ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer,
234 size_t count, loff_t *off) { 342 size_t count, loff_t *off) {
235 uint32_t target_runlist; 343 uint32_t target_runlist, target_offset;
236 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; 344 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
237 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 345 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
238 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); 346 int err = kstrtou32_from_user(buffer, count, 0, &target_offset);
239 if (err) 347 if (err)
240 return err; 348 return err;
349 // (Ab)use the PDE_DATA field for the runlist ID
350 target_runlist = file2gpuidx(f);
241 351
242 // resubmit_runlist() checks that target_runlist is valid 352 // resubmit_runlist() checks that target_runlist is valid
243 if ((err = resubmit_runlist(g, target_runlist))) 353 if ((err = resubmit_runlist(g, target_runlist, target_offset)))
244 return err; 354 return err;
245 355
246 return count; 356 return count;
@@ -351,6 +461,54 @@ struct file_operations enable_channel_file_ops = {
351 .llseek = default_llseek, 461 .llseek = default_llseek,
352}; 462};
353 463
464ssize_t comm_preempt_channel_file_write(struct file *f, const char __user *buf,
465 size_t count, loff_t *off,
466 enum COMPUTE_PREEMPT_TYPE mode) {
467 uint32_t target_channel, target_runlist;
468 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
469 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
470 int err = kstrtou32_from_user(buf, count, 0, &target_channel);
471 if (err)
472 return err;
473 // (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
474 target_runlist = file2gpuidx(f);
475 // Set preemption mode for the context of this channel
476 if ((err = set_channel_preemption_mode(g, target_channel, target_runlist, mode)))
477 return err;
478
479 return count;
480}
481
482ssize_t wfi_preempt_channel_file_write(struct file *f, const char __user *buf,
483 size_t count, loff_t *off) {
484 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_WFI);
485}
486
487struct file_operations wfi_preempt_channel_file_ops = {
488 .write = wfi_preempt_channel_file_write,
489 .llseek = default_llseek,
490};
491
492ssize_t cta_preempt_channel_file_write(struct file *f, const char __user *buf,
493 size_t count, loff_t *off) {
494 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CTA);
495}
496
497struct file_operations cta_preempt_channel_file_ops = {
498 .write = cta_preempt_channel_file_write,
499 .llseek = default_llseek,
500};
501
502ssize_t cil_preempt_channel_file_write(struct file *f, const char __user *buf,
503 size_t count, loff_t *off) {
504 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CILP);
505}
506
507struct file_operations cil_preempt_channel_file_ops = {
508 .write = cil_preempt_channel_file_write,
509 .llseek = default_llseek,
510};
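// Usage sketch for the three files above (file names are assumed from the
// *_file_ops identifiers; paths follow the existing per-runlist runlistY/
// layout, since the runlist ID is recovered from PDE_DATA via file2gpuidx()):
//   echo 4 > runlist0/wfi_preempt_channel   # wait-for-idle preemption for channel 4
//   echo 4 > runlist0/cta_preempt_channel   # CTA-level preemption for channel 4
//   echo 4 > runlist0/cil_preempt_channel   # compute-instruction-level preemption (CILP)
// Each write calls set_channel_preemption_mode() on the context of the named
// channel in that runlist.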
511
354// Tested working on Pascal (gp106) through Ada (ad102) 512// Tested working on Pascal (gp106) through Ada (ad102)
355ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, 513ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
356 size_t count, loff_t *off) { 514 size_t count, loff_t *off) {
@@ -419,11 +577,13 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
419 577
420 // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"? 578 // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"?
421 } 579 }
580#warning switch_to_tsg has preempt_runlist omitted!
581 return count;
422 582
423 // Resubmit the runlist to ensure that changes to channel enablement are 583 // Resubmit the runlist to ensure that changes to channel enablement are
424 // picked up on Turing+ GPUs (channel enablements may not be otherwise). 584 // picked up on Turing+ GPUs (channel enablements may not be otherwise).
425 if (g->chip_id >= NV_CHIP_ID_TURING) 585 if (g->chip_id >= NV_CHIP_ID_TURING)
426 if ((err = resubmit_runlist(g, target_runlist))) 586 if ((err = resubmit_runlist(g, target_runlist, -1)))
427 return err; 587 return err;
428 588
429 // Trigger a runlist-level preempt to stop whatever was running, triggering 589 // Trigger a runlist-level preempt to stop whatever was running, triggering
@@ -438,3 +598,470 @@ struct file_operations switch_to_tsg_file_ops = {
438 .write = switch_to_tsg_file_write, 598 .write = switch_to_tsg_file_write,
439 .llseek = default_llseek, 599 .llseek = default_llseek,
440}; 600};
601
602ssize_t preempt_runlist_file_write(struct file *f, const char __user *buffer,
603 size_t count, loff_t *off) {
604 uint32_t target_runlist;
605 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
606 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
607 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
608 if (err)
609 return err;
610
611 // TODO: Check runlist is in-range
612 if ((err = preempt_runlist(g, target_runlist)))
613 return err;
614
615 return count;
616}
617
618struct file_operations preempt_runlist_file_ops = {
619 .write = preempt_runlist_file_write,
620 .llseek = default_llseek,
621};
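// Usage sketch (file name assumed from the _file_ops identifier): writing a
// runlist ID triggers a runlist-level preempt of that runlist, e.g.
//   echo 3 > preempt_runlist
// Note that, per the TODO above, the runlist ID is not yet range-checked here.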
622
623// Value written to this file is which runlist to ack the IRQ for
624ssize_t ack_bad_tsg_file_write(struct file *f, const char __user *buffer,
625 size_t count, loff_t *off) {
626 uint32_t target_runlist;
627 uint32_t rl_ram_off;
628 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
629 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
630 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
631 if (err)
632 return err;
633
634 if ((err = get_runlist_ram(g, target_runlist, &rl_ram_off)))
635 return err;
636
637 nvdebug_writel(g, rl_ram_off + 0x100, 1 << 12);
638
639 return count;
640}
641
642struct file_operations ack_bad_tsg_file_ops = {
643 .write = ack_bad_tsg_file_write,
644 .llseek = default_llseek,
645};
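// Usage sketch (file name assumed from the _file_ops identifier): writing a
// runlist ID acknowledges the interrupt for that runlist by setting bit 12 of
// the register at offset 0x100 into its runlist RAM, e.g.
//   echo 2 > ack_bad_tsg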
646
647// Rather than mapping all of BAR0, we just map:
648// - On Pascal, Volta, Turing: MC_BOOT, PFIFO, PCCSR, PTOP
649// - On Ampere: MC_BOOT, RAMRL(0), CHRAM(0), PTOP
650// "All CUDA-managed pointers are within---the first 40 bits of the process's
651// VA space" (Sec. 4.1, GPUDirect RDMA Documentation)
652// - This means 0x00ff_ffff_ffff is the highest valid CUDA virtual address,
653// and all higher addresses are unused.
654// - So we use 0x6000_0000_0000+; this falls within the first PDE3 entry, and
655// at the end of the PDE2 entries
656// + Using the second PDE3 entry did not appear to work on Jetson (IIRC)
657#define BAR0_USER_ADDR 0x0000700000000000llu
658#define MEM_USER_ADDR 0x0000600000000000llu
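// Illustrative consequence of the constants above (a sketch; pointer names are
// this comment's own): once map_mem_for_instance() below has populated a
// channel's virtual address space, work running in that context can reach VRAM
// physical address P at MEM_USER_ADDR + P, and a mapped BAR0 register at
// offset R at BAR0_USER_ADDR + R, e.g.:
//   volatile uint32_t *boot0 =
//       (volatile uint32_t *)(BAR0_USER_ADDR + NV_MC_BOOT_0);
//   uint32_t id = *boot0;  // read MC_BOOT through the in-context mapping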
659
660/* Map all of GPU VRAM, and selected BAR0 regions, into a channel instance's
661 * virtual address space at predefined offsets (above).
662 *
663 * @param g Pointer to the nvdebug state for the selected GPU
664 * @param inst_ptr Dereferencible pointer to the channel's instance block
665 * @returns 0 on success, -errno on error
666 *
667 * Support: Pascal, Volta, Turing, Ampere
668 */
669int map_mem_for_instance(struct nvdebug_state *g, instance_ctrl_t *inst_ptr) {
670 int ret;
671 uintptr_t off, ram_size;
672 dma_addr_t bus_mc_boot_ram, bus_ptop_ram, bus_fifo_ram, bus_chan_ctrl_ram;
673 uint64_t mc_boot_ram, ptop_ram, fifo_ram, chan_ctrl_ram;
674 page_dir_config_t chan_pd_config;
675 memory_range_t mem_range;
676 uint32_t channel_ram_off, runlist_ram_off, channel_ram_size, bar0_base;
677 struct iommu_domain *dom;
678
679 if (g->chip_id >= NV_CHIP_ID_AMPERE) {
680 runlist_channel_config_t channel_config;
681 if ((ret = get_runlist_ram(g, 0, &runlist_ram_off))) {
682 printk(KERN_ERR "[nvdebug] %s: Unable to determine location of runlist0 RAM!\n", __func__);
683 return ret;
684 }
685 if (runlist_ram_off & 0xfff) {
686 printk(KERN_ERR "[nvdebug] %s: Runlist0 RAM is not page-aligned!\n", __func__);
687 return -EAFNOSUPPORT;
688 }
689 if ((channel_config.raw = nvdebug_readl(g, runlist_ram_off + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
690 return -EIO;
691 channel_ram_off = (uint32_t)channel_config.bar0_offset << 4;
692 if (channel_ram_off & 0xfff) {
693 printk(KERN_ERR "[nvdebug] %s: Runlist0 CHRAM is not page-aligned!\n", __func__);
694 return -EAFNOSUPPORT;
695 }
696 channel_ram_size = (1 << channel_config.num_channels_log2) * sizeof(channel_ctrl_ga100_t);
697 printk(KERN_DEBUG "[nvdebug] %s: Mapping CHRAM at %#018llx--%x and RLRAM at %#018llx--%x.\n", __func__, BAR0_USER_ADDR + channel_ram_off, channel_ram_size-1, BAR0_USER_ADDR + runlist_ram_off, 4095);
698 } else {
699 channel_ram_off = NV_PCCSR;
700 // MAX_CHID * sizeof(channel_ctrl_gf100_t) is < 4 KiB, so hardcode
701 channel_ram_size = 4096;
702 runlist_ram_off = NV_PFIFO;
703 }
704
705 // map_mem_by_chid() pulls the instance block via PRAMIN, so inst_ptr will
 706	// be invalid after moving PRAMIN (e.g. as part of a page table operation).
707 // To avoid accessing inst_ptr after invalidation, keep a copy of what we
708 // need.
709 chan_pd_config = inst_ptr->pdb;
710
711 // map_page_directory_v1() is unimplemented, precluding Maxwell (or older)
712 // support (as they don't support v2 page tables).
713 if (!chan_pd_config.is_ver2)
714 return -EOPNOTSUPP;
715
716 // Determine the size of GPU physical memory (VRAM).
717 if ((mem_range.raw = nvdebug_readl(g, NV_FB_MMU_LOCAL_MEMORY_RANGE)) == -1)
718 return -EIO;
719 ram_size = memory_range_to_bytes(mem_range);
720
721 // We map memory using huge pages, and thus do not support GPUs with
722 // non-2-MiB-divisible VID_MEM sizes.
723 if (ram_size % (1 << 21) != 0) {
724 printk(KERN_ERR "[nvdebug] %s: GPU VID_MEM of %lu bytes is not a multiple of 2 MiB!\n", __func__, ram_size);
725 return -EAFNOSUPPORT;
726 }
727
 728	// Map all of physical GPU memory (VID_MEM) into this channel's GPU virtual
729 // address space using huge (2 MiB) pages.
730 for (off = 0; off < ram_size; off += (1 << 21)) {
731 if ((ret = map_page_directory(g, chan_pd_config,
732 MEM_USER_ADDR + off, off, TARGET_VID_MEM, true)) < 0)
733 return ret;
734 // If the mapping already exists for this page directory, the other
735 // mappings should already exist, and can be skipped.
736 if (ret == 1) {
737 printk(KERN_INFO "[nvdebug] %s: VRAM mapping from %llx to %lx already exists. Assuming all mappings already exist and returning early...\n", __func__, MEM_USER_ADDR + off, off);
738 return 0;
739 }
740 }
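	// Sizing example (illustrative): a GPU with 4 GiB of VID_MEM takes
	// 4 GiB / 2 MiB = 2048 iterations of the loop above, i.e. 2048 huge-page
	// mappings rooted at MEM_USER_ADDR.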
741
742 // Map Channel RAM to a GPU-accessible bus address (gets past any IOMMU or
743 // IOVA layers), then map that address into this channel's GPU virtual
744 // address space. NV_PCCSR_CHANNEL_INST(0) is 4k-aligned, so it can be
745 // directly mapped.
746 // XXX: All these mappings are currently returning -1 on all reads on
747 // sunlight, jbakita-old, jetson-xavier, jetson-orin, and bonham,
748 // which seems to be returned from the PCIe root (on PCIe GPUs).
749 if (g->pcid)
750 bar0_base = pci_resource_start(g->pcid, 0);
751 else if (g->platd)
752 bar0_base = platform_get_resource(g->platd, IORESOURCE_MEM, 0)->start;
753 else
754 return -ENOTRECOVERABLE;
755 mc_boot_ram = NV_MC_BOOT_0 + bar0_base;
 756	// PTOP fits within a page but is not page-aligned; round down.
757 ptop_ram = (NV_PTOP & ~0xfffu) + bar0_base;
758 fifo_ram = runlist_ram_off + bar0_base;
759 chan_ctrl_ram = channel_ram_off + bar0_base;
760
761 // Check if GPU-accessible bus addresses are the same as CPU-visible physical
762 // addresses. Logic from amdgpu_device_check_iommu_direct_map().
763 dom = iommu_get_domain_for_dev(g->dev);
764 if (!dom || dom->type == IOMMU_DOMAIN_IDENTITY) {
765 // Used for: jbakita-old, sunlight, jetson-xavier, jetson-orin integrated, bonham, ?
766 // (For all these, reads on the mapping return only -1.)
767 // (Forcing these through dma_map_resource()/iommu_map() changes nothing)
768 // (Note that the `ls -l /sys/class/iommu/*/devices` also reports that the
769 // GPU is not available under the I/O MMU on these platforms.)
770 // To fix this, please enable AMD-Vi/ARM SMMU/Intel VT-d in your BIOS
771 // settings, UEFI settings, or device-tree file. Supported on:
772 // - AMD: Bulldozer+ (or Phenom II w/ 890FX or 990FX Chipset)
773 // - Intel: Most since Core2 Duo
774 // Note that while the Jetson Orin has an SMMU (I/O MMU), the GPU does not
775 // appear to be configured by any pre-provided device tree files to use the
776 // SMMU.
777 printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is unavailable/disabled for GPU %x. Assuming phys and bus addresses are identical...\n", g->chip_id);
778 bus_mc_boot_ram = mc_boot_ram;
779 bus_ptop_ram = ptop_ram;
780 bus_fifo_ram = fifo_ram;
781 bus_chan_ctrl_ram = chan_ctrl_ram;
782 } else {
783 printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is enabled. Attempting to use dma_map_resource()...\n");
784 // Used for: tama, yamaha
785 // Fails on tama, yamaha
786 // (Works on jetson-xavier, jetson-orin and bonham, but appears to be a no-op, and
787 // yields inaccessible memory. Get `mc-err: (255) csr_nvl7r: EMEM address decode error`
788 // on access on jetson boards, and a -1 read on all.)
789 bus_mc_boot_ram = dma_map_resource(g->dev, mc_boot_ram, 4096*2 /* *2 is a XXX hack to include PBUS */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
790 bus_ptop_ram = dma_map_resource(g->dev, ptop_ram, 4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
791 bus_fifo_ram = dma_map_resource(g->dev, fifo_ram, 4096*8 /* *8 is a XXX hack */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
792 bus_chan_ctrl_ram = dma_map_resource(g->dev, chan_ctrl_ram, 2*4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
793 if (dma_mapping_error(g->dev, bus_mc_boot_ram) ||
794 dma_mapping_error(g->dev, bus_ptop_ram) ||
795 dma_mapping_error(g->dev, bus_fifo_ram) ||
796 dma_mapping_error(g->dev, bus_chan_ctrl_ram)) {
797 // Used for: tama, yamaha
798 printk(KERN_WARNING "[nvdebug] map_mem_ctxid: Unable to map BAR0 addresses to device-accessible addresses via dma_map_resource(). Return codes: %d for MC_BOOT, %d for PFIFO, %d for PCCSR.\n",
799 dma_mapping_error(g->dev, bus_mc_boot_ram),
800 dma_mapping_error(g->dev, bus_fifo_ram),
801 dma_mapping_error(g->dev, bus_chan_ctrl_ram));
802 // This fallback does not appear to work on jbakita-old (5.4, GART IOMMU), but works on tama
803 if (!get_dma_ops(g->dev))
804 printk(KERN_WARNING "[nvdebug] Reason: No DMA `ops`, and direct mapping failed.\n");
805 else if (!get_dma_ops(g->dev)->map_resource)
806 // Fires on: tama, yamaha
807 printk(KERN_WARNING "[nvdebug] Reason: `map_resource` function undefined on this platform.\n");
808 if (!dom) {
809 printk(KERN_ERR "[nvdebug] map_mem_ctxid: No I/O MMU available and dma_map_resource() failed. Aborting mapping of BAR0 regions!\n");
810 return -ENOTRECOVERABLE;
811 }
812 printk(KERN_INFO "[nvdebug] map_mem_ctxid: Trying to fall back to direct I/O MMU manipulation...\n");
813 // XXX: Fallback to directly creating the I/O MMU mappings.
814 // This is necessary. Directly accessing BAR0 addresses throws I/O MMU
815 // errors in the kernel log on yamaha.
816 // See also: comment on kfd_mem_dmamap_sg_bo() in amdgpu
817 // Note: dma_map_resource -> map_resource -> [arm_]iommu_map_resource
818 // -> __iommu_dma_map -> iommu_map is the happy-path, but this seems to
819 // regularly fail, even though the iommu_map path works. One key
820 // difference is that the dma_map_resource() path also includes
821 // IOMMU_MMIO in the iommu_map() flags.
822 bus_mc_boot_ram = mc_boot_ram;
823 bus_ptop_ram = ptop_ram;
824 bus_fifo_ram = fifo_ram;
825 bus_chan_ctrl_ram = chan_ctrl_ram;
826 // Create identity mapping
827 ret = iommu_map(dom, mc_boot_ram, mc_boot_ram, 4096*2 /* *2 is a hack to fit in PBUS*/, IOMMU_READ | IOMMU_WRITE);
828 if (ret < 0) {
829 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for MC_BOOT!\n");
830 return ret;
831 }
832 ret = iommu_map(dom, ptop_ram, ptop_ram, 4096, IOMMU_READ | IOMMU_WRITE);
833 if (ret < 0) {
834 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PTOP!\n");
835 return ret;
836 }
837 ret = iommu_map(dom, fifo_ram, fifo_ram, 4096*8 /* *8 is XXX hack*/, IOMMU_READ | IOMMU_WRITE);
838 if (ret < 0) {
839 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for FIFO!\n");
840 return ret;
841 }
842 ret = iommu_map(dom, chan_ctrl_ram, chan_ctrl_ram, channel_ram_size, IOMMU_READ | IOMMU_WRITE);
843 if (ret < 0) {
844 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PCCSR!\n");
845 return ret;
846 }
847 }
848 }
849 // TARGET_SYS_MEM_NONCOHERENT tells the GPU to bypass the CPU L2 cache for
850 // accesses to this memory.
851 // "Clients should normally use [SYS_MEM_NON_COHERENT]" (nvgpu)
852 //
853 // "Non-coherent system memory.
854 // (GPU) MMU will NOT maintain coherence with CPU L2 cache.
855 // Higher-level APIs should only allow this when it is known
856 // the memory is not cacheable by CPU or the coherency is
857 // managed explicitly (e.g. w/ flushes in SW).
858 // Also consider that this path is not necessarily faster." (open-gpu-kernel-modules)
859 //
860 // "Coherent system memory.
861 // (GPU) MMU will snoop CPU L2 cache if possible.
862 // This is usually the safer choice over NONCOH since it works
863 // whether the memory is cached by CPU L2 or not.
864 // On some CPU architectures going through CPU L2 may
865 // even be faster than the non-coherent path." (open-gpu-kernel-modules)
866 //
 867	// I suspect that for SYS_MEM_NONCOHERENT mappings, the "no snoop"
868 // attribute bit will be set on associated PCIe read/write transactions.
869 //
870 // The only other bits in a PCIe read/write transaction that could be
871 // relevant are the two AT (Address Translation) bits added in PCIe 2.0.
872 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0,
873 bus_mc_boot_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
874 return ret;
875 // XXX
876 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0 + 4096,
877 bus_mc_boot_ram + 4096, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
878 return ret;
879 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + (NV_PTOP & ~0xfffu),
880 bus_ptop_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
881 return ret;
882 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off,
883 bus_fifo_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
884 return ret;
885 // XXX
886 for (off = 4096; off < 8*4096; off += 4096)
887 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off+off,
888 bus_fifo_ram+off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
889 return ret;
890 // Channel control RAM can span two or more pages on Ampere+
891 for (off = 0; off < channel_ram_size; off += 4096)
892 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + channel_ram_off + off,
893 bus_chan_ctrl_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
894 return ret;
895 return 0;
896}
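// Summary of the address-space layout produced by a successful call above:
//   MEM_USER_ADDR  + 0 .. ram_size-1           -> all of VID_MEM (2 MiB pages)
//   BAR0_USER_ADDR + NV_MC_BOOT_0 (2 pages)    -> MC_BOOT (+ the PBUS hack page)
//   BAR0_USER_ADDR + (NV_PTOP & ~0xfff)        -> PTOP (one page)
//   BAR0_USER_ADDR + runlist_ram_off (8 pages) -> PFIFO / runlist RAM
//   BAR0_USER_ADDR + channel_ram_off ..        -> channel control RAM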
897
898// Map by context ID
899// See constituent functions for info on what they do; comments not repeated.
900// Tested on Pascal, Volta, Turing, and Kepler
901ssize_t map_mem_ctxid_file_write(struct file *f, const char __user *buffer,
902 size_t count, loff_t *off) {
903 int err, target_context, target_runlist;
904 loff_t pos;
905 uint64_t instance_ptr;
906 enum INST_TARGET instance_target;
907 struct runlist_iter rl_iter;
908 instance_ctrl_t *inst;
909 context_switch_ctrl_t *ctx_block;
910 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
911 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
912 if ((err = kstrtou32_from_user(buffer, count, 0, &target_context)))
913 return err;
914 target_runlist = file2gpuidx(f);
915
916 // Get dereferencable pointer to the runlist
917 if ((err = get_runlist_iter(g, target_runlist, &rl_iter)))
918 return err;
919 // Find a channel in the runlist matching the provided context ID
920 for (pos = 0; pos < rl_iter.len; pos++, rl_iter.curr_entry += NV_RL_ENTRY_SIZE(g)) {
921 uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
922 if (entry_type(g, rl_iter.curr_entry) == ENTRY_TYPE_TSG)
923 continue;
924 // Get instance block address
925 if (g->chip_id >= NV_CHIP_ID_AMPERE) {
926 instance_ptr = ((struct gv100_runlist_chan*)rl_iter.curr_entry)->inst_ptr_hi;
927 instance_ptr <<= 32;
928 instance_ptr |= (uint64_t)inst_ptr_lo(g, rl_iter.curr_entry) << 12;
929 instance_target = inst_target(g, rl_iter.curr_entry);
930 ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
931 } else {
932 channel_ctrl_t chan;
933 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, rl_iter.curr_entry)));
934 if (chan.raw == -1)
935 return -EIO;
936 instance_ptr = (uint64_t)chan.inst_ptr << 12;
937 instance_target = chan.inst_target;
938 }
939 // Skip channels with unconfigured or INVALID instance blocks
940 if (!instance_ptr || instance_target == 1) {
941 printk(KERN_WARNING "[nvdebug] Channel %d is in runlist %d, but "
942 "lacks a valid instance block", chid(g, rl_iter.curr_entry),
943 target_runlist);
944 continue;
945 }
946
947 // Get a dereferencable pointer to the instance block
948 if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
949 return PTR_ERR(inst);
950 // If unable to access instance block, skip
951 if (!inst)
952 continue;
953
954 // Get dereferencable pointer to CTXSW block
955 if (IS_ERR(ctx_block = get_ctxsw(g, inst)))
956 return PTR_ERR(ctx_block);
957 // If unable to access CTXSW block, skip
958 if (!ctx_block)
959 continue;
960 // Check if the context ID matches
961 if (ctx_block->context_id != target_context)
962 continue;
963
964 // XXX: Disable the context switch timeout while we're here
965 ctxsw_timeout_t timeout_config;
966 if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
967 return -EIO;
968 timeout_config.enabled = 0;
969 nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
970 // XXX: Attempt setting preemption mode while we're here
971 ctx_block->compute_preemption_options = PREEMPT_CTA;
972
973 // Map memory and return
974 if ((err = map_mem_for_instance(g, inst)) < 0)
975 return err;
976 return count;
977 }
978 return -ESRCH;
979}
980
981struct file_operations map_mem_ctxid_file_ops = {
982 .write = map_mem_ctxid_file_write,
983 .llseek = default_llseek,
984};
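// Usage sketch (file name assumed from the _file_ops identifier; the file is
// per-runlist, since file2gpuidx() supplies the runlist ID):
//   echo 0x1 > runlist0/map_mem_ctxid
// searches runlist 0 for a channel whose CTXSW block carries context ID 0x1,
// then maps VRAM and the selected BAR0 regions into that channel's VA space.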
985
986// Map by channel ID (LEGACY; unclear if this needs to be kept)
987// Support: Pascal, Volta, and Turing only
988ssize_t map_mem_chid_file_write(struct file *f, const char __user *buffer,
989 size_t count, loff_t *off) {
990 int ret, target_channel;
991 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
992 channel_ctrl_t chan;
993 instance_ctrl_t *inst_ptr;
994 bool all = false;
995 uint64_t inst_ptr_off;
996 page_dir_config_t bar2_pd_config;
997 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
998 if ((ret = kstrtos32_from_user(buffer, count, 0, &target_channel)))
999 return ret;
1000
1001 if (g->chip_id >= NV_CHIP_ID_AMPERE)
1002 return -ENOSYS;
1003
 1004	// This API is for nvsched, which is only supported on GPUs that support
1005 // instruction-level preemption (Pascal+).
1006 if (g->chip_id < NV_CHIP_ID_PASCAL)
1007 return -EOPNOTSUPP;
1008
1009 if (target_channel > MAX_CHID)
1010 return -ERANGE;
1011
1012 // Passing -1 indicates that all channels should be mapped
1013 if (target_channel == -1) {
1014 all = true;
1015 target_channel = 0;
1016 }
1017
1018 do {
1019 printk(KERN_INFO "[nvdebug] Mapping channel %d\n", target_channel);
1020 // Read the channel's configuration block, which includes the address of
1021 // this channel's instance block, which contains a page table pointer.
1022 // TODO: Verify this works with the channel RAM changes on Ampere+
1023 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
1024 if (chan.raw == -1)
1025 return -EIO;
1026
1027 // If the instance pointer is unconfigured or the target is 1 (INVALID),
1028 // this channel is not in-use on any runlist and can be skipped.
1029 if (chan.inst_ptr == 0 || chan.inst_target == 1)
1030 continue;
1031
 1032		// Find page tables which define how BAR2 offsets are translated to physical
1033 // VID_MEM/SYS_MEM addresses. (We have to do this every time since we reset
1034 // PRAMIN.)
1035 if ((ret = get_bar2_pdb(g, &bar2_pd_config)) < 0)
1036 return ret;
1037
1038 // Pascal+ GPUs use Version 2 page tables, so this shouldn't be a problem
1039 if (!bar2_pd_config.is_ver2)
1040 return -ENOSYS;
1041
1042 // To read the instance block, first find where it is mapped in BAR2
1043 if ((inst_ptr_off = search_page_directory(g, bar2_pd_config, (u64)chan.inst_ptr << 12, chan.inst_target)) == 0) {
1044 // If no mapping can be found in BAR2, fallback to accessing the
1045 // instance block via the PRAMIN window.
1046 printk(KERN_WARNING "[nvdebug] Warning: Channel %d has no instance "
1047 "block mapped in BAR2. Falling back to PRAMIN...\n", target_channel);
1048 if ((ret = addr_to_pramin_mut(g, (u64)chan.inst_ptr << 12, chan.inst_target)) < 0)
1049 return -EOPNOTSUPP;
1050 inst_ptr = g->regs + NV_PRAMIN + ret;
1051 } else {
1052 inst_ptr = g->bar2 + inst_ptr_off;
1053 }
1054
1055 if ((ret = map_mem_for_instance(g, inst_ptr)))
1056 return ret;
1057
1058 // If mapping all channels, start again at the next one
1059 } while (all && ++target_channel <= MAX_CHID);
1060
1061 return count;
1062}
1063
1064struct file_operations map_mem_chid_file_ops = {
1065 .write = map_mem_chid_file_write,
1066 .llseek = default_llseek,
1067};
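// Usage sketch (file name assumed from the _file_ops identifier): writing a
// channel ID performs the same mapping for that channel's instance block;
// writing -1 walks every channel up to MAX_CHID, e.g.
//   echo 7  > map_mem_chid   # map for channel 7 only
//   echo -1 > map_mem_chid   # map for all configured channels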