author    Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:01 -0400
committer Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:13 -0400
commit    293430fcb5d4013b573556c58457ee706e482b7f (patch)
tree      9328fa680f55b4e1a08d24714275b8437be3be5d
parent    494df296bf4abe9b2b484bde1a4fad28c989afec (diff)
Snapshot for ECRTS'25 artifact evaluation
-rw-r--r--  Makefile               3
-rw-r--r--  README.md             10
-rw-r--r--  device_info_procfs.c  79
-rw-r--r--  mmu.c                414
-rw-r--r--  nvdebug.h            293
-rw-r--r--  nvdebug_entry.c      476
-rw-r--r--  nvdebug_linux.h        5
-rw-r--r--  runlist.c            275
-rw-r--r--  runlist_procfs.c     645
9 files changed, 2154 insertions, 46 deletions
@@ -8,3 +8,6 @@ all: | |||
8 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules | 8 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules |
9 | clean: | 9 | clean: |
10 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean | 10 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean |
11 | |||
12 | nvdebug_user.so: runlist.c mmu.c bus.c nvdebug_user.c | ||
13 | gcc $< -shared -o $@ $(KBUILD_CFLAGS) | ||
@@ -59,6 +59,7 @@ Not all these TPCs will necessarily be enabled in every GPC. | |||
59 | Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X. | 59 | Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X. |
60 | A set bit indicates a disabled TPC. | 60 | A set bit indicates a disabled TPC. |
61 | This API is only available on enabled GPCs. | 61 | This API is only available on enabled GPCs. |
62 | Bits at positions beyond the number of on-chip TPCs per GPC should be ignored (it may appear that non-existent TPCs are "disabled"). | ||
62 | 63 | ||
63 | Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC). | 64 | Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC). |
64 | 65 | ||
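For illustration, a minimal user-space sketch of this calculation (a hedged example, not part of the module; the `/proc/gpu0/` directory is an assumption about where nvdebug's files appear on your system):

```c
#include <stdio.h>

// Parse one nvdebug procfs value (decimal or 0x-prefixed hex)
static int read_val(const char *path, long *val) {
	FILE *f = fopen(path, "r");
	if (!f)
		return -1;
	int ok = (fscanf(f, "%li", val) == 1);
	fclose(f);
	return ok ? 0 : -1;
}

int main(void) {
	long num_gpcs, tpc_per_gpc;
	if (read_val("/proc/gpu0/num_gpcs", &num_gpcs) ||
	    read_val("/proc/gpu0/num_tpc_per_gpc", &tpc_per_gpc))
		return 1;
	// Volta+: 2 SMs per TPC
	printf("On-chip SMs: %ld\n", num_gpcs * tpc_per_gpc * 2);
	return 0;
}
```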
@@ -83,6 +84,13 @@ Use `echo Z > runlistY/switch_to_tsg` to switch the GPU to run only the specifie | |||
83 | 84 | ||
84 | Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels). | 85 | Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels). |
85 | 86 | ||
87 | ## Error Interpretation | ||
88 | First, check the kernel log to see if it includes more information about the error. | ||
89 | The following conventions are used for certain error codes: | ||
90 | |||
91 | - EIO, "Input/Output Error," is returned when an operation fails due to a bad register read. | ||
92 | - (Other errors may not have a consistent conventional meaning; see the implementation.) | ||
93 | |||
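As a hedged illustration of the EIO convention above (the procfs path is an assumption; any nvdebug register-backed file behaves the same way):

```c
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void) {
	char buf[32];
	// Path is illustrative only
	int fd = open("/proc/gpu0/num_gpcs", O_RDONLY);
	if (fd < 0)
		return 1;
	if (read(fd, buf, sizeof(buf)) < 0 && errno == EIO)
		fprintf(stderr, "Bad register read; check dmesg: %s\n", strerror(errno));
	close(fd);
	return 0;
}
```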
86 | ## General Codebase Structure | 94 | ## General Codebase Structure |
87 | - `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers. | 95 | - `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers. |
88 | - `nvdebug_entry.c` contains module startup, device detection, initialization, and module teardown logic. | 96 | - `nvdebug_entry.c` contains module startup, device detection, initialization, and module teardown logic.
@@ -94,4 +102,4 @@ Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GP | |||
94 | 102 | ||
95 | - The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs). | 103 | - The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs). |
96 | To work around this, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting. | 104 | To work around this, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting.
97 | (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.) | 105 | (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_drm nvidia_modeset nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.) |
diff --git a/device_info_procfs.c b/device_info_procfs.c
index 4e4ab03..105e731 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -18,7 +18,7 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, | |||
18 | return 0; | 18 | return 0; |
19 | 19 | ||
20 | if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1) | 20 | if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1) |
21 | return -EOPNOTSUPP; | 21 | return -EIO; |
22 | // 32 bit register will always take less than 16 characters to print | 22 | // 32 bit register will always take less than 16 characters to print |
23 | chars_written = scnprintf(out, 16, "%#0x\n", read); | 23 | chars_written = scnprintf(out, 16, "%#0x\n", read); |
24 | if (copy_to_user(buf, out, chars_written)) | 24 | if (copy_to_user(buf, out, chars_written)) |
@@ -32,12 +32,85 @@ struct file_operations nvdebug_read_reg32_file_ops = { | |||
32 | .llseek = default_llseek, | 32 | .llseek = default_llseek, |
33 | }; | 33 | }; |
34 | 34 | ||
35 | typedef union { | ||
36 | struct { | ||
37 | uint8_t partitioning_select:2; | ||
38 | uint8_t table_select:2; | ||
39 | uint32_t pad_1:12; | ||
40 | uint8_t veid_offset:6; | ||
41 | uint32_t pad_2:2; | ||
42 | uint8_t table_offset:6; | ||
43 | uint32_t pad_3:2; | ||
44 | }; | ||
45 | uint32_t raw; | ||
46 | } partition_ctl_t; | ||
47 | |||
48 | static ssize_t nvdebug_read_part(struct file *f, char __user *buf, size_t size, loff_t *off) { | ||
49 | char out[12*64+2]; | ||
50 | int i, chars_written = 0; | ||
51 | partition_ctl_t part_ctl; | ||
52 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | ||
53 | if (size < sizeof(out) || *off != 0) | ||
54 | return 0; | ||
55 | // Read the current PARTITION_CTL configuration before modifying it | ||
56 | part_ctl.raw = nvdebug_readl(g, 0x00405b2c); | ||
57 | //part_ctl.partitioning_select = 0; // XXX XXX XXX Temp; 06/18/2024 | ||
58 | //part_ctl.table_select = 3; // 3 == ??? | ||
59 | //part_ctl.table_select = 2; // 2 == TBL_SEL_PARTITIONING_LMEM_BLK | ||
60 | part_ctl.table_select = 1; // 1 == TBL_SEL_PARTITIONING_ENABLE | ||
61 | //part_ctl.table_select = 0; // 0 == TBL_SEL_NONE | ||
62 | part_ctl.veid_offset = (uintptr_t)pde_data(file_inode(f)); // Range of [0, 0x3f], aka [0, 63] | ||
63 | for (i = 0; i < 64; i++) { | ||
64 | // Increment to next table offset in PARTITION_CTL | ||
65 | part_ctl.table_offset = i; | ||
66 | nvdebug_writel(g, 0x00405b2c, part_ctl.raw); | ||
67 | // Verify write applied to PARTITION_CTL | ||
68 | part_ctl.raw = nvdebug_readl(g, 0x00405b2c); | ||
69 | if (part_ctl.table_offset != i) | ||
70 | return -ENOTRECOVERABLE; | ||
71 | // Read PARTITION_DATA and print | ||
72 | // --- | ||
73 | // I get back 0x000000ff on Volta and 0x00000003 on Turing from | ||
74 | // PARTITION_DATA for all possible VEID_OFFSET, TBL_OFFSET, and TBL_SEL | ||
75 | // combinations. | ||
76 | // --- | ||
77 | // There's a 48-byte (12-word) gap after the address for PARTITION_DATA. | ||
78 | // Exploring this on Turing for TBL_SEL_PARTITIONING_ENABLE, VEID 1, 62, and | ||
79 | // 63, with CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=5 for constant_cycles_kernel | ||
80 | // running under MPS: | ||
81 | // +0x0: 0x3 | ||
82 | // +0x4: 0 | ||
83 | // +0x8: 0x100 | ||
84 | // +0xC: 0 | ||
85 | // +0x10: 0xffffffff | ||
86 | // +0x14: 0 | ||
87 | // +0x18: 0 | ||
88 | // +0x1C: 0xffffffff | ||
89 | // +0x20: 0 | ||
90 | // +0x24: 0xffffffff | ||
91 | // +0x28: 0xffffffff | ||
92 | // +0x2C: 0xffffffff | ||
93 | chars_written += scnprintf(out + chars_written, 12, "%#010x ", nvdebug_readl(g, 0x00405b30)); | ||
94 | } | ||
95 | chars_written += scnprintf(out + chars_written, 2, "\n"); | ||
96 | if (copy_to_user(buf, out, chars_written)) | ||
97 | printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name); | ||
98 | *off += chars_written; | ||
99 | return chars_written; | ||
100 | } | ||
101 | |||
102 | struct file_operations nvdebug_read_part_file_ops = { | ||
103 | .read = nvdebug_read_part, | ||
104 | .llseek = default_llseek, | ||
105 | }; | ||
106 | |||
35 | static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { | 107 | static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { |
36 | char out[12]; | 108 | char out[12]; |
37 | int chars_written; | 109 | int chars_written; |
38 | uint32_t read, mask; | 110 | uint32_t read, mask; |
39 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | 111 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; |
40 | // See comment in nvdebug_entry.c to understand `union reg_range` | 112 | // `start_bit` is included, `stop_bit` is not, so to print lower eight bits |
113 | // from a register, use `start_bit = 0` and `stop_bit = 8`. | ||
41 | union reg_range range; | 114 | union reg_range range; |
42 | range.raw = (uintptr_t)pde_data(file_inode(f)); | 115 | range.raw = (uintptr_t)pde_data(file_inode(f)); |
43 | 116 | ||
@@ -47,7 +120,7 @@ static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t s | |||
47 | 120 | ||
48 | // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` | 121 | // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` |
49 | if ((read = nvdebug_readl(g, range.offset)) == -1) | 122 | if ((read = nvdebug_readl(g, range.offset)) == -1) |
50 | return -EOPNOTSUPP; | 123 | return -EIO; |
51 | // Setup `mask` used to throw out unused upper bits | 124 | // Setup `mask` used to throw out unused upper bits |
52 | mask = -1u >> (32 - range.stop_bit + range.start_bit); | 125 | mask = -1u >> (32 - range.stop_bit + range.start_bit); |
53 | // Throw out unused lower bits via a shift, apply the mask, and print | 126 | // Throw out unused lower bits via a shift, apply the mask, and print |
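To make the mask arithmetic concrete, here is a stand-alone sketch of the same shift-and-mask extraction performed by `nvdebug_reg_range_read()` (the register value and bit ranges are arbitrary; the helper is illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

// Extract bits [start_bit, stop_bit) of a 32-bit register value: build a mask
// of (stop_bit - start_bit) ones, shift the unused low bits away, then mask.
static uint32_t extract_bits(uint32_t value, unsigned start_bit, unsigned stop_bit) {
	uint32_t mask = -1u >> (32 - stop_bit + start_bit);
	return (value >> start_bit) & mask;
}

int main(void) {
	// Lower eight bits of 0xdeadbeef -> 0xef
	printf("%#x\n", extract_bits(0xdeadbeef, 0, 8));
	// Bits [8, 16) -> 0xbe
	printf("%#x\n", extract_bits(0xdeadbeef, 8, 16));
	return 0;
}
```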
@@ -1,9 +1,13 @@ | |||
1 | /* Copyright 2024 Joshua Bakita | 1 | /* Copyright 2024 Joshua Bakita |
2 | * Helpers to deal with NVIDIA's MMU and associated page tables | 2 | * Helpers to deal with NVIDIA's MMU and associated page tables |
3 | */ | 3 | */ |
4 | #include <linux/dma-mapping.h> // dma_map_page() and dma_unmap_page() | ||
4 | #include <linux/err.h> // ERR_PTR() etc. | 5 | #include <linux/err.h> // ERR_PTR() etc. |
6 | #include <linux/gfp.h> // alloc_pages() | ||
5 | #include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() | 7 | #include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() |
6 | #include <linux/kernel.h> // Kernel types | 8 | #include <linux/kernel.h> // Kernel types |
9 | #include <linux/list.h> // struct list_head and associated functions | ||
10 | #include <linux/mm.h> // put_page() | ||
7 | 11 | ||
8 | #include "nvdebug.h" | 12 | #include "nvdebug.h" |
9 | 13 | ||
@@ -15,6 +19,11 @@ int g_verbose = 0; | |||
15 | #define printk_debug if (g_verbose >= 2) printk | 19 | #define printk_debug if (g_verbose >= 2) printk |
16 | #define printk_info if (g_verbose >= 1) printk | 20 | #define printk_info if (g_verbose >= 1) printk |
17 | 21 | ||
22 | // At least map_page_directory() assumes that pages are 4 KiB | ||
23 | #if PAGE_SIZE != 4096 | ||
24 | #error nvdebug assumes and requires a 4 KiB page size. | ||
25 | #endif | ||
26 | |||
18 | /* Convert a page directory (PD) pointer and aperture to be kernel-accessible | 27 | /* Convert a page directory (PD) pointer and aperture to be kernel-accessible |
19 | 28 | ||
20 | I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the | 29 | I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the |
@@ -22,7 +31,8 @@ int g_verbose = 0; | |||
22 | 31 | ||
23 | @param addr Pointer from page directory entry (PDE) | 32 | @param addr Pointer from page directory entry (PDE) |
24 | @param pd_ap PD-type aperture (target address space) for `addr` | 33 | @param pd_ap PD-type aperture (target address space) for `addr` |
25 | @return A dereferencable kernel address, or an ERR_PTR-wrapped error | 34 | @return A dereferencable kernel address, 0 if an I/O MMU is in use and has |
35 | no available mapping for the bus address, or an ERR_PTR-wrapped error | ||
26 | */ | 36 | */ |
27 | static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, | 37 | static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, |
28 | enum PD_TARGET pd_ap) { | 38 | enum PD_TARGET pd_ap) { |
@@ -56,7 +66,7 @@ static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, | |||
56 | // Check for, and translate through, the I/O MMU (if any) | 66 | // Check for, and translate through, the I/O MMU (if any) |
57 | if ((dom = iommu_get_domain_for_dev(g->dev))) { | 67 | if ((dom = iommu_get_domain_for_dev(g->dev))) { |
58 | phys = iommu_iova_to_phys(dom, addr); | 68 | phys = iommu_iova_to_phys(dom, addr); |
59 | printk_debug(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", addr, phys); | 69 | printk_debug(KERN_DEBUG "[nvdebug] %s: I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", __func__, addr, phys); |
60 | } else | 70 | } else |
61 | phys = addr; | 71 | phys = addr; |
62 | 72 | ||
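Callers must distinguish three outcomes from `pd_deref()`: a usable kernel address, NULL when the I/O MMU has no mapping for the bus address, or an ERR_PTR-encoded errno. A hedged sketch of a helper that folds these into a single error code (`pd_deref_checked()` is hypothetical and not in the tree; the choice of -ENXIO for the unmapped case is an assumption):

```c
// Hypothetical helper: resolve a PD/PT entry address or return an errno.
static int pd_deref_checked(struct nvdebug_state *g, uintptr_t addr,
                            enum PD_TARGET pd_ap, void __iomem **kva_out) {
	void __iomem *kva = pd_deref(g, addr, pd_ap);
	if (IS_ERR(kva))
		return PTR_ERR(kva); // Hard failure (e.g. unsupported aperture)
	if (!kva)
		return -ENXIO;       // I/O MMU present, but address unmapped
	*kva_out = kva;          // Safe to pass to readq()/writeq()
	return 0;
}
```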
@@ -143,6 +153,327 @@ uint64_t search_page_directory(struct nvdebug_state *g, | |||
143 | return 0; | 153 | return 0; |
144 | } | 154 | } |
145 | 155 | ||
156 | /* GPU Virtual address -> Physical address ("forward" translation) for V2 tables | ||
157 | Index the page directories and tables used by the GPU MMU to determine which | ||
158 | physical address a given GPU virtual address has been mapped to. | ||
159 | |||
160 | The page directory and tables may be located in VID_MEM, SYS_MEM, or spread | ||
161 | across multiple apertures. | ||
162 | |||
163 | @param pd_config Page Directory configuration, containing pointer and | ||
164 | aperture for the start of the PDE3 entries | ||
165 | @param addr_to_find Virtual address to translate to a physical address | ||
166 | @param found_addr Where to store found physical address (0 if unfound) | ||
167 | @param found_aperture Where to store aperture of found physical address | ||
168 | @return 0 on success, -ENXIO if not found, and -errno on error. | ||
169 | */ | ||
170 | int translate_page_directory(struct nvdebug_state *g, | ||
171 | page_dir_config_t pd_config, | ||
172 | uint64_t addr_to_find, | ||
173 | uint64_t *found_addr /* out */, | ||
174 | enum INST_TARGET *found_aperture /* out */) { | ||
175 | page_dir_entry_t entry; | ||
176 | void __iomem *next_kva; | ||
177 | unsigned int level, pde_idx; | ||
178 | uintptr_t next = (uintptr_t)pd_config.page_dir << 12; | ||
179 | enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target); | ||
180 | |||
181 | *found_addr = 0; | ||
182 | *found_aperture = TARGET_INVALID; | ||
183 | |||
184 | // Make sure that the query is page-aligned (likely mistake otherwise) | ||
185 | if (addr_to_find & 0xfff) { | ||
186 | printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_page_directory()!\n", addr_to_find); | ||
187 | return -EINVAL; | ||
188 | } | ||
189 | |||
190 | printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V2 page table with base %#018llx\n", (u64)addr_to_find, (u64)next); | ||
191 | |||
192 | // Step through each PDE level and the PTE level | ||
193 | for (level = 0; level < 5; level++) { | ||
194 | // Index into this level | ||
195 | pde_idx = (addr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1); | ||
196 | printk_debug(KERN_DEBUG "[nvdebug] Using index %u in lvl %d\n", pde_idx, level); | ||
197 | // Hack to workaround PDE0 being double-size and strangely formatted | ||
198 | if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16) | ||
199 | next += 8; | ||
200 | // Obtain a kernel-dereferencable address | ||
201 | next_kva = pd_deref(g, next, next_target); | ||
202 | if (IS_ERR_OR_NULL(next_kva)) { | ||
203 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva)); | ||
204 | return PTR_ERR(next_kva); | ||
205 | } | ||
206 | // Obtain entry at this level | ||
207 | entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx); | ||
208 | if (entry.target == PD_AND_TARGET_INVALID) | ||
209 | return -ENXIO; | ||
210 | printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %#018llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w); | ||
211 | // Just return the physical address if this is the PTE level | ||
212 | if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB | ||
213 | *found_addr = ((uint64_t)entry.addr) << 12; | ||
214 | *found_aperture = entry.aperture; | ||
215 | return 0; | ||
216 | } | ||
217 | // Otherwise step to the next table level | ||
218 | // TODO: Use addr_w as appropriate | ||
219 | next = (uint64_t)entry.addr << 12; | ||
220 | next_target = entry.target; | ||
221 | } | ||
222 | |||
223 | return 0; | ||
224 | } | ||
225 | |||
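A hedged usage sketch for the forward-translation API above, using the BAR2/BAR3 page directory as the example page table (the wrapper function is hypothetical; error-return conventions of `get_bar2_pdb()` are assumed to follow the usual 0/-errno pattern):

```c
// Hypothetical example: translate a GPU virtual address in the BAR2/BAR3
// address space to a physical address and aperture.
static int example_translate_bar2(struct nvdebug_state *g, uint64_t gpu_va) {
	page_dir_config_t pd_config;
	uint64_t phys;
	enum INST_TARGET aperture;
	int err;

	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	err = translate_page_directory(g, pd_config, gpu_va, &phys, &aperture);
	if (err == -ENXIO)
		printk(KERN_INFO "[nvdebug] %#018llx is unmapped\n", gpu_va);
	else if (!err)
		printk(KERN_INFO "[nvdebug] %#018llx -> %#018llx in %s\n",
		       gpu_va, phys, target_to_text(aperture));
	return err;
}
```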
226 | // This struct is very special. We will never directly allocate this struct; | ||
227 | // its sole purpose is to provide more intuitive names to the offsets at which | ||
228 | // we store data in Linux's struct page. Such (ab)use of struct page is | ||
229 | // explicitly permitted (see linux/mm_types.h). This struct is thus used by | ||
230 | // casting a pointer of struct page to a pointer of struct nvdebug_pd_page, | ||
231 | // then accessing the associated fields. This pointer may also be freely cast | ||
232 | // back to a struct page pointer. | ||
233 | // We have 24 (32-bit) or 44 (64-bit) bytes available in the page struct | ||
234 | // (according to the documentation on struct page). Our comments indicate what | ||
235 | // available parts of struct page we repurpose for our own needs. | ||
236 | struct nvdebug_pd_page { | ||
237 | unsigned long __flags; // From struct page; do not touch! | ||
238 | // Overlaps struct page.lru | ||
239 | struct list_head list; // 4/8 bytes | ||
240 | // Overlaps struct page.mapping (and page.share on 32-bit) | ||
241 | uintptr_t parent_addr; // 8 bytes | ||
242 | // Overlaps struct page.share (page.private on 32-bit) | ||
243 | enum PD_TARGET parent_aperture; // 4 bytes | ||
244 | // Overlaps page.private (page.page_type on 32-bit) | ||
245 | dma_addr_t dma_addr; // 4/8 bytes | ||
246 | }; | ||
247 | |||
248 | /* Collect and free any now-unused page directory/table allocations | ||
249 | |||
250 | @param force Deallocate all page directories/tables created by this module, | ||
251 | no matter if they appear to be in-use or not. | ||
252 | @returns Number of freed pages on success, -errno on error. | ||
253 | */ | ||
254 | int gc_page_directory(struct nvdebug_state *g, bool force) { | ||
255 | struct nvdebug_pd_page *page, *_page; | ||
256 | void __iomem *parent_kva; | ||
257 | page_dir_entry_t parent_entry; | ||
258 | int freed_pages = 0; | ||
259 | |||
260 | // Depth-first traversal (from perspective of each page table) of page | ||
261 | // allocations. | ||
262 | // (This is depth-first because map_page_directory() always allocates and | ||
263 | // pushes page directory allocations before page table allocations.) | ||
264 | list_for_each_entry_safe_reverse(page, _page, &g->pd_allocs, list) { | ||
265 | printk_debug(KERN_DEBUG "[nvdebug] %s: Checking if page directory/table at %llx (SYS_MEM_?) with parent at %lx (%s) is unused...\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture)); | ||
266 | // Try to determine if we're still in-use. We consider ourselves | ||
267 | // potentially in-use if our parent still points to us. | ||
268 | parent_kva = pd_deref(g, page->parent_addr, page->parent_aperture); | ||
269 | if (IS_ERR(parent_kva)) { | ||
270 | printk(KERN_ERR "[nvdebug] %s: Error resolving %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, page->parent_addr, pd_target_to_text(page->parent_aperture), PTR_ERR(parent_kva)); | ||
271 | return -ENOTRECOVERABLE; | ||
272 | } | ||
273 | // A NULL kva indicates parent no longer exists | ||
274 | parent_entry.raw_w = parent_kva ? readq(parent_kva) : 0; | ||
275 | // Page directory/table still in-use; do not free unless forced | ||
276 | if (parent_entry.addr_w == (page->dma_addr >> 12) && !force) | ||
277 | continue; | ||
278 | // Free this page table/directory and delete our parent's pointer to us | ||
279 | if (parent_entry.addr_w == (page->dma_addr >> 12)) { | ||
280 | printk(KERN_WARNING "[nvdebug] %s: Deleting page table/directory at %llx (SYS_MEM_?) with parent at %lx (%s) that may still be in-use!\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture)); | ||
281 | writeq(0, parent_kva); | ||
282 | } | ||
283 | // Unmap, zero, free, and remove from tracking (these all return void) | ||
284 | dma_unmap_page(g->dev, page->dma_addr, PAGE_SIZE, DMA_TO_DEVICE); | ||
285 | memset(page_to_virt((struct page*)page), 0, PAGE_SIZE); | ||
286 | // Necessary to reset mapcount as we (ab)use its state for other things | ||
287 | page_mapcount_reset((struct page*)page); | ||
288 | // Same reset needed for mapping | ||
289 | ((struct page*)page)->mapping = NULL; | ||
290 | // Remove this page from our list of allocated pages | ||
291 | list_del(&page->list); | ||
292 | // Free the page | ||
293 | put_page((struct page*)page); | ||
294 | freed_pages++; | ||
295 | } | ||
296 | printk_debug(KERN_DEBUG "[nvdebug] %s: Freed %d pages.\n", __func__, freed_pages); | ||
297 | return freed_pages; | ||
298 | } | ||
299 | |||
300 | /* Map a GPU virtual address to a physical address in a GPU page table | ||
301 | Search for a mapping for specified GPU virtual address, and create a new one | ||
302 | if none is found. Automatically creates page directories and page table | ||
303 | entries as necessary. | ||
304 | |||
305 | The page directory and tables may be located in VID_MEM, SYS_MEM, or spread | ||
306 | across multiple apertures. | ||
307 | |||
308 | @param pd_config Page Directory configuration, containing pointer and | ||
309 | aperture for the start of the PDE3 entries | ||
310 | @param vaddr_to_find Virtual address to check, and map to a physical address | ||
311 | if nothing is already mapped (up to 49 bits long) | ||
312 | @param paddr_to_map Physical address to use (up to 36 bits long if VID_MEM, | ||
313 | and up to 58 bits if SYS_MEM) | ||
314 | @param paddr_target Which space does the physical address refer to? | ||
315 | @param huge_page Set to map a 2 MiB, rather than 4 KiB, page | ||
316 | @return 0 on success, 1 if mapping already exists, -EADDRINUSE if virtual | ||
317 | address is already mapped to something else, and -errno on error | ||
318 | */ | ||
319 | int map_page_directory(struct nvdebug_state *g, | ||
320 | page_dir_config_t pd_config, | ||
321 | uint64_t vaddr_to_find, | ||
322 | uint64_t paddr_to_map, | ||
323 | enum INST_TARGET paddr_target, | ||
324 | bool huge_page) { | ||
325 | page_dir_entry_t entry; | ||
326 | void __iomem *next_kva; | ||
327 | unsigned int level, pde_idx; | ||
328 | uintptr_t next = (uintptr_t)pd_config.page_dir << 12; | ||
329 | enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target); | ||
330 | |||
331 | // Make sure that the query is page-aligned (likely mistake otherwise) | ||
332 | if ((vaddr_to_find & 0xfff || paddr_to_map & 0xfff) | ||
333 | || (huge_page && (vaddr_to_find & 0x1fffff || paddr_to_map & 0x1fffff))) { | ||
334 | printk(KERN_WARNING "[nvdebug] %s: Attempting to map an unaligned address (physical %#018llx or virtual %#018llx)! Failing...\n", __func__, paddr_to_map, vaddr_to_find); | ||
335 | return -EINVAL; | ||
336 | } | ||
337 | |||
338 | // NVIDIA supports up to 49-bit virtual addresses | ||
339 | // Except Jetson Xavier only seems to be able to resolve 47-bit addresses? | ||
340 | if (vaddr_to_find >> 49) { | ||
341 | printk(KERN_WARNING "[nvdebug] %s: vaddr_to_find (%#018llx) is beyond the 49-bit virtual address space supported by the GPU! Failing...\n", __func__, vaddr_to_find); | ||
342 | return -EINVAL; | ||
343 | } | ||
344 | |||
345 | // NVIDIA supports up to 36-bit VID_MEM addresses | ||
346 | if (paddr_target == TARGET_VID_MEM && paddr_to_map >> 36) { | ||
347 | printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 36-bit VID_MEM address space! Failing...\n", __func__, paddr_to_map); | ||
348 | return -EINVAL; | ||
349 | } | ||
350 | |||
351 | // NVIDIA supports up to 58-bit SYS_MEM addresses | ||
352 | if ((paddr_target == TARGET_SYS_MEM_COHERENT || | ||
353 | paddr_target == TARGET_SYS_MEM_NONCOHERENT) && paddr_to_map >> 58) { | ||
354 | printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 58-bit SYS_MEM address space! Failing...\n", __func__, paddr_to_map); | ||
355 | return -EINVAL; | ||
356 | } | ||
357 | |||
358 | // We don't support mapping to PEERs; that requires a PEER ID | ||
359 | if (paddr_target == TARGET_PEER) { | ||
360 | printk(KERN_WARNING "[nvdebug] %s: paddr_target must be SYS_MEM_* or VID_MEM! Failing...\n", __func__); | ||
361 | return -EINVAL; | ||
362 | } | ||
363 | |||
364 | printk_info(KERN_INFO "[nvdebug] Mapping addr %#018llx in page table with base %#018llx to %s address %#018llx\n", vaddr_to_find, (u64)next, target_to_text(paddr_target), paddr_to_map); | ||
365 | |||
366 | // Step through each PDE level and the PTE level | ||
367 | for (level = 0; level < 5; level++) { | ||
368 | // Index into this level | ||
369 | pde_idx = (vaddr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1); | ||
370 | printk_debug(KERN_DEBUG "[nvdebug] In table at KVA %#lx, using index %u in lvl %d\n", (uintptr_t)next, pde_idx, level); | ||
371 | // Hack to workaround PDE0 being double-size and strangely formatted | ||
372 | if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16) | ||
373 | next += 8; | ||
374 | // Obtain a kernel-dereferencable address | ||
375 | next_kva = pd_deref(g, next, next_target); | ||
376 | if (IS_ERR_OR_NULL(next_kva)) { | ||
377 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva)); | ||
378 | return -ENOTRECOVERABLE; | ||
379 | } | ||
380 | // Obtain entry at this level | ||
381 | entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx); | ||
382 | // If pointer to next level of the table does not exist | ||
383 | if (entry.target == PD_AND_TARGET_INVALID) { // PTE or PD covered by PD_AND_TARGET_INVALID | ||
384 | if (level == 4 || (huge_page && level == 3)) { | ||
385 | // Create new PTE (allocation, as needed, is handled at level 2 or 3) | ||
386 | // Targets observed in page tables: | ||
387 | // For PCIe: entry.target == PTE_AND_TARGET_VID_MEM; | ||
388 | // For Jetson: entry.target == PTE_AND_TARGET_SYS_MEM_NONCOHERENT; | ||
389 | entry.is_pte = 1; | ||
390 | entry.aperture = paddr_target; | ||
391 | if (paddr_target == TARGET_VID_MEM) | ||
392 | entry.addr = paddr_to_map >> 12; | ||
393 | else | ||
394 | entry.addr_w = paddr_to_map >> 12; | ||
395 | // Set the volatile bit (as NVRM does for SYS_MEM_COHERENT mappings) | ||
396 | // (This does nothing if the target is VID_MEM, but if the target is | ||
397 | // SYS_MEM_*, accesses will bypass the L2.) | ||
398 | entry.is_volatile = 1; | ||
399 | // Leave other fields zero, yielding an unencrypted, unprivileged, r/w, | ||
400 | // volatile mapping with atomics enabled. | ||
401 | |||
402 | // XXX: Hack to work around PDE0 double-size weirdness. Huge | ||
403 | // page mapping will fault without this. | ||
404 | if (level == 3) | ||
405 | writeq(entry.raw_w, next_kva - 8 + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx); | ||
406 | } else { | ||
407 | struct page* page_dir; | ||
408 | struct nvdebug_pd_page* page_dir_reinterpret; | ||
409 | dma_addr_t page_dir_dma; | ||
410 | // Allocate one 4 KiB all-zero (all invalid) page directory/ | ||
411 | // table at the next level | ||
412 | if (!(page_dir = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0))) | ||
413 | return -ENOMEM; | ||
414 | // Obtain a GPU-accessible/bus address for this page (handling | ||
415 | // I/O MMU mappings, etc.) | ||
416 | page_dir_dma = dma_map_page(g->dev, page_dir, 0, PAGE_SIZE, DMA_TO_DEVICE); | ||
417 | // Verify that we were able to create a mapping | ||
418 | if (dma_mapping_error(g->dev, page_dir_dma)) | ||
419 | return dma_mapping_error(g->dev, page_dir_dma); | ||
420 | // Record this allocation for freeing later | ||
421 | // Note: Linux maintains a page struct for every page in the | ||
422 | // system. This struct has available space that drivers | ||
423 | // can use to store their own tracking information. Our | ||
424 | // struct nvdebug_pd_page facilitates this. | ||
425 | page_dir_reinterpret = (struct nvdebug_pd_page*)page_dir; | ||
426 | page_dir_reinterpret->parent_addr = next + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx; | ||
427 | page_dir_reinterpret->parent_aperture = next_target; | ||
428 | page_dir_reinterpret->dma_addr = page_dir_dma; | ||
429 | list_add(&page_dir_reinterpret->list, &g->pd_allocs); | ||
430 | // Point this entry to the new directory/table | ||
431 | entry.target = PD_AND_TARGET_SYS_MEM_COHERENT; // Observed in page tables | ||
432 | // Must use addr_w with SYS_MEM targets | ||
433 | entry.addr_w = page_dir_dma >> 12; | ||
434 | // On Jetson and NVRM, all PDEs are marked volatile | ||
435 | entry.is_volatile = 1; | ||
436 | // We don't configure ATS, so disable ATS lookups for speed. | ||
437 | entry.no_ats = 1; | ||
438 | } | ||
439 | writeq(entry.raw_w, next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx); | ||
440 | printk_debug(KERN_DEBUG "[nvdebug] Created %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w); | ||
441 | // Successfully created the requested PTE, so return | ||
442 | if (entry.is_pte) | ||
443 | return 0; | ||
444 | } else { | ||
445 | printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w); | ||
446 | } | ||
447 | |||
448 | // If this is the PTE level, return success if the address and target are correct | ||
449 | if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB | ||
450 | if (entry.aperture != paddr_target) | ||
451 | return -EADDRINUSE; // Also handles PEER | ||
452 | if (entry.aperture == TARGET_VID_MEM) | ||
453 | return (uint64_t)entry.addr == paddr_to_map >> 12 ? 1 : -EADDRINUSE; | ||
454 | else | ||
455 | return entry.addr_w == paddr_to_map >> 12 ? 1 : -EADDRINUSE; // SYS_MEM is wider | ||
456 | } | ||
457 | |||
458 | // If mapping a 2 MiB page and we made it here, level 3 had a PDE. This | ||
459 | // means that the requested 2 MiB virtual region already has one or more | ||
460 | // small pages mapped within it---a.k.a., the addresses are in use. | ||
461 | // If we didn't bail out here, the above logic would attempt to fallback | ||
462 | // to a 4 KiB mapping, which would be unexpected behavior. | ||
463 | if (huge_page && level == 3) | ||
464 | return -EADDRINUSE; | ||
465 | |||
466 | // Otherwise step to the next table level | ||
467 | if (entry.aperture == TARGET_VID_MEM) | ||
468 | next = (uint64_t)entry.addr << 12; | ||
469 | else | ||
470 | next = (uint64_t)entry.addr_w << 12; // SYS_MEM is wider | ||
471 | next_target = entry.target; | ||
472 | } | ||
473 | |||
474 | return -ENOTRECOVERABLE; // Should be impossible | ||
475 | } | ||
476 | |||
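A hedged end-to-end sketch of the mapping API above: allocate a backing page, map it at a chosen GPU virtual address in the BAR2/BAR3 page table, then use `gc_page_directory()` to release any directories/tables nvdebug created along the way (the function and its policy choices are illustrative, not part of the tree):

```c
// Hypothetical example: back `gpu_va` with one zeroed system-memory page.
static int example_map_and_cleanup(struct nvdebug_state *g, uint64_t gpu_va) {
	page_dir_config_t pd_config;
	struct page *backing;
	dma_addr_t backing_dma;
	int err;

	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	if (!(backing = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0)))
		return -ENOMEM;
	// Obtain a GPU-accessible bus address for the backing page
	backing_dma = dma_map_page(g->dev, backing, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(g->dev, backing_dma)) {
		__free_pages(backing, 0);
		return -ENOMEM;
	}
	// 0: newly mapped; 1: identical mapping already present; <0: error
	err = map_page_directory(g, pd_config, gpu_va, backing_dma,
	                         TARGET_SYS_MEM_COHERENT, false);
	printk(KERN_INFO "[nvdebug] map_page_directory() returned %d\n", err);
	// When done, free any page directories/tables this module allocated
	gc_page_directory(g, true);
	dma_unmap_page(g->dev, backing_dma, PAGE_SIZE, DMA_BIDIRECTIONAL);
	__free_pages(backing, 0);
	return err < 0 ? err : 0;
}
```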
146 | /* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables | 477 | /* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables |
147 | (See `search_page_directory()` for documentation.) | 478 | (See `search_page_directory()` for documentation.) |
148 | */ | 479 | */ |
@@ -187,7 +518,7 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g, | |||
187 | // Verify PDE is present | 518 | // Verify PDE is present |
188 | if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) | 519 | if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) |
189 | continue; | 520 | continue; |
190 | // printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); | 521 | // TODO: Handle huge pages |
191 | printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); | 522 | printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); |
192 | // For each PTE | 523 | // For each PTE |
193 | for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { | 524 | for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { |
@@ -215,7 +546,84 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g, | |||
215 | return 0; | 546 | return 0; |
216 | } | 547 | } |
217 | 548 | ||
549 | /* GPU Virtual address -> Physical address ("forward" translation) for V1 tables | ||
550 | (See `translate_page_directory()` for documentation.) | ||
551 | */ | ||
552 | int translate_v1_page_directory(struct nvdebug_state *g, | ||
553 | page_dir_config_t pd_config, | ||
554 | uint64_t addr_to_find, | ||
555 | uint64_t *found_addr /* out */, | ||
556 | enum INST_TARGET *found_aperture /* out */) { | ||
557 | page_dir_entry_v1_t pde; | ||
558 | page_tbl_entry_v1_t pte; | ||
559 | uintptr_t pde_idx, pde_phys, pte_idx, pte_phys; | ||
560 | void __iomem *pte_kva, *pde_kva; | ||
561 | |||
562 | *found_addr = 0; | ||
563 | *found_aperture = TARGET_INVALID; | ||
564 | |||
565 | // Make sure that the query is page-aligned (likely mistake otherwise) | ||
566 | if (addr_to_find & 0xfff) { | ||
567 | printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_v1_page_directory()!\n", addr_to_find); | ||
568 | return -EINVAL; | ||
569 | } | ||
570 | |||
571 | // This function only understands the Page Table Version 1 format | ||
572 | if (pd_config.is_ver2) { | ||
573 | printk(KERN_ERR "[nvdebug] Passed a Version 2 page table at %#018llx to translate_v1_page_directory()!\n", (uint64_t)pd_config.page_dir << 12); | ||
574 | return -EINVAL; | ||
575 | } | ||
576 | |||
577 | // We only understand the Version 1 format when 128 KiB huge pages are in-use | ||
578 | if (pd_config.is_64k_big_page) { | ||
579 | printk(KERN_ERR "[nvdebug] Page Table Version 1 with 64 KiB huge pages is unsupported!\n"); | ||
580 | return -EINVAL; | ||
581 | } | ||
582 | |||
583 | printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V1 page table with base %#018llx\n", (uint64_t)addr_to_find, (uint64_t)pd_config.page_dir << 12); | ||
584 | |||
585 | // Shift bits which define PDE index to start at bit 0, and mask other bits | ||
586 | pde_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[0]) & (NV_MMU_PT_V1_SZ[0] - 1); | ||
587 | // Compute VID_MEM/SYS_MEM address of page directory entry | ||
588 | pde_phys = ((uint64_t)pd_config.page_dir << 12) + pde_idx * sizeof(page_dir_entry_v1_t); | ||
589 | // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA) | ||
590 | pde_kva = pd_deref(g, pde_phys, INST2PD_TARGET(pd_config.target)); | ||
591 | if (IS_ERR_OR_NULL(pde_kva)) { | ||
592 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_phys, target_to_text(pd_config.target), PTR_ERR(pde_kva)); | ||
593 | return PTR_ERR(pde_kva); | ||
594 | } | ||
595 | // Read page directory entry (readq seems to work fine; tested on GM204) | ||
596 | pde.raw = readq(pde_kva); | ||
597 | // Verify this PDE points to an array of page table entries | ||
598 | if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) | ||
599 | return -ENXIO; | ||
600 | // TODO: Check for and handle huge pages | ||
601 | printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); | ||
602 | |||
603 | // Shift bits which define PTE index to start at bit 0, and mask other bits | ||
604 | pte_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[1]) & (NV_MMU_PT_V1_SZ[1] - 1); | ||
605 | // Compute VID_MEM/SYS_MEM address of page table entry | ||
606 | pte_phys = ((uint64_t)pde.alt_addr << 12) + pte_idx * sizeof(page_tbl_entry_v1_t); | ||
607 | // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA) | ||
608 | pte_kva = pd_deref(g, pte_phys, V12PD_TARGET(pde.alt_target)); | ||
609 | if (IS_ERR_OR_NULL(pte_kva)) { | ||
610 | printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_phys, pd_target_to_text(pde.alt_target), PTR_ERR(pte_kva)); | ||
611 | return PTR_ERR(pte_kva); | ||
612 | } | ||
613 | // Read page table entry | ||
614 | pte.raw = readq(pte_kva); | ||
615 | // XXX: The above readq() is bogus on gk104 (returns -1). Potential issue of pd_deref's move of PRAMIN racing with the driver? | ||
616 | if (!pte.is_present) | ||
617 | return -ENXIO; | ||
618 | printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)pte.addr) << 12, target_to_text(pte.target), pte.is_volatile, pte.is_privileged, pte.is_readonly, pte.atomics_disabled, pte.raw); | ||
619 | // Access PTE and return physical address | ||
620 | *found_addr = (uint64_t)pte.addr << 12; | ||
621 | *found_aperture = pte.target; | ||
622 | return 0; | ||
623 | } | ||
624 | |||
218 | /* *** UNTESTED *** | 625 | /* *** UNTESTED *** |
626 | // This is only relevant on pre-Kepler GPUs; not a current priority | ||
219 | #define NV_MMU_PT_V0_SZ 2048 | 627 | #define NV_MMU_PT_V0_SZ 2048 |
220 | #define NV_MMU_PT_V0_LSB 29 | 628 | #define NV_MMU_PT_V0_LSB 29 |
221 | uint64_t search_v0_page_directory(struct nvdebug_state *g, | 629 | uint64_t search_v0_page_directory(struct nvdebug_state *g, |
@@ -2,6 +2,7 @@ | |||
2 | * SPDX-License-Identifier: MIT | 2 | * SPDX-License-Identifier: MIT |
3 | * | 3 | * |
4 | * File outline: | 4 | * File outline: |
5 | * - Configuration options | ||
5 | * - Runlist, preemption, and channel control (FIFO) | 6 | * - Runlist, preemption, and channel control (FIFO) |
6 | * - Basic GPU information (MC) | 7 | * - Basic GPU information (MC) |
7 | * - Detailed GPU information (PTOP, FUSE, and CE) | 8 | * - Detailed GPU information (PTOP, FUSE, and CE) |
@@ -20,6 +21,27 @@ | |||
20 | // this, so declare as incomplete type to avoid pulling in the nvgpu headers. | 21 | // this, so declare as incomplete type to avoid pulling in the nvgpu headers. |
21 | struct gk20a; | 22 | struct gk20a; |
22 | 23 | ||
24 | // Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer | ||
25 | // in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN | ||
26 | // **must** not be moved during runlist traversal. | ||
27 | // - The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this | ||
28 | // must be enabled to print the runlist on the TX2. | ||
29 | // - On the A100 in Google Cloud and H100 in Paperspace, as of Aug 2024, this is | ||
30 | // needed, as nvdebug is not finding (at least) runlist0 mapped in BAR2/3. | ||
31 | // Automatically disables printing Instance Block and Context State while | ||
32 | // traversing the runlist, as these require conflicting uses of PRAMIN (it's | ||
33 | // needed to search the page tables for the Instance Block in BAR2/3, and to | ||
34 | // access anything in the Context State---aka CTXSW). | ||
35 | #define FALLBACK_TO_PRAMIN | ||
36 | |||
37 | // Starting offset for registers in the corresponding named range | ||
38 | // Programmable First-In First-Out unit; also known as "Host" | ||
39 | #define NV_PFIFO 0x00002000 // 8 KiB long; ends prior to 0x00004000 | ||
40 | // Programmable Channel Control System RAM | ||
41 | #define NV_PCCSR 0x00800000 // 16 KiB long; ends prior to 0x00810000 | ||
42 | // Programmable TOPology registers | ||
43 | #define NV_PTOP 0x00022400 // 1 KiB long; ends prior to 0x00022800 | ||
44 | |||
23 | /* Runlist Channel | 45 | /* Runlist Channel |
24 | A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue | 46 | A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue |
25 | of GPU commands. These commands are typically queued from userspace. | 47 | of GPU commands. These commands are typically queued from userspace. |
@@ -202,7 +224,7 @@ typedef union { | |||
202 | Support: Ampere, Hopper, Ada, [newer untested] | 224 | Support: Ampere, Hopper, Ada, [newer untested] |
203 | */ | 225 | */ |
204 | #define NV_RUNLIST_PREEMPT_GA100 0x098 | 226 | #define NV_RUNLIST_PREEMPT_GA100 0x098 |
205 | #define PREEMPT_TYPE_RUNLIST 0 | 227 | #define PREEMPT_TYPE_RUNLIST PREEMPT_TYPE_CHANNEL |
206 | 228 | ||
207 | /* | 229 | /* |
208 | "Initiate a preempt of the engine by writing the bit associated with its | 230 | "Initiate a preempt of the engine by writing the bit associated with its |
@@ -355,6 +377,14 @@ typedef union { | |||
355 | uint64_t raw; | 377 | uint64_t raw; |
356 | } runlist_base_tu102_t; | 378 | } runlist_base_tu102_t; |
357 | 379 | ||
380 | /* | ||
381 | LEN : Read/Write | ||
382 | OFFSET : Read/Write | ||
383 | PREEMPTED_TSGID : Read-only | ||
384 | VALID_PREEMPTED_TSGID : Read-only | ||
385 | IS_PENDING : Read-only | ||
386 | PREEMPTED_OFFSET : Read-only | ||
387 | */ | ||
358 | typedef union { | 388 | typedef union { |
359 | struct { | 389 | struct { |
360 | uint16_t len:16; | 390 | uint16_t len:16; |
@@ -416,6 +446,27 @@ typedef union { | |||
416 | uint32_t raw; | 446 | uint32_t raw; |
417 | } runlist_channel_config_t; | 447 | } runlist_channel_config_t; |
418 | 448 | ||
449 | /* Context Switch Timeout Configuration | ||
450 | After a task's budget expires, there's a configurable grace period, a | ||
451 | "timeout", within which the context needs to complete. After this timeout | ||
452 | expires, an interrupt is raised to terminate the task. | ||
453 | |||
454 | This register configures if such a timeout is enabled and how long the | ||
455 | timeout is (the "period"). | ||
456 | |||
457 | Support: Volta, Turing | ||
458 | */ | ||
459 | #define NV_PFIFO_ENG_CTXSW_TIMEOUT 0x00002A0C | ||
460 | // Support: Ampere | ||
461 | #define NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(i) (0x220+(i)*64) | ||
462 | typedef union { | ||
463 | struct { | ||
464 | uint32_t period:31; | ||
465 | bool enabled:1; | ||
466 | } __attribute__((packed)); | ||
467 | uint32_t raw; | ||
468 | } ctxsw_timeout_t; | ||
469 | |||
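A hedged sketch of decoding this register on Volta/Turing using the layout above (the helper is illustrative only; the units of `period` are not specified here):

```c
// Hypothetical example: print the pre-Ampere context switch timeout settings.
static void example_print_ctxsw_timeout(struct nvdebug_state *g) {
	ctxsw_timeout_t timeout;
	timeout.raw = nvdebug_readl(g, NV_PFIFO_ENG_CTXSW_TIMEOUT);
	if (timeout.raw == -1) // nvdebug_readl() returns -1 on a failed read
		return;
	printk(KERN_INFO "[nvdebug] CTXSW timeout %s, period %u\n",
	       timeout.enabled ? "enabled" : "disabled", timeout.period);
}
```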
419 | /* Programmable Channel Control System RAM (PCCSR) | 470 | /* Programmable Channel Control System RAM (PCCSR) |
420 | 512-entry array of channel control and status data structures. | 471 | 512-entry array of channel control and status data structures. |
421 | 472 | ||
@@ -477,8 +528,15 @@ typedef union { | |||
477 | bool busy:1; | 528 | bool busy:1; |
478 | uint32_t :3; | 529 | uint32_t :3; |
479 | } __attribute__((packed)); | 530 | } __attribute__((packed)); |
531 | struct { | ||
532 | uint32_t word1; | ||
533 | uint32_t word2; | ||
534 | } __attribute__((packed)); | ||
480 | uint64_t raw; | 535 | uint64_t raw; |
481 | } channel_ctrl_t; | 536 | } channel_ctrl_gf100_t; |
537 | |||
538 | // TODO: Remove use of deprecated type name | ||
539 | typedef channel_ctrl_gf100_t channel_ctrl_t; | ||
482 | 540 | ||
483 | /* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) | 541 | /* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) |
484 | Starting with Ampere, channel IDs are no longer unique indexes into the | 542 | Starting with Ampere, channel IDs are no longer unique indexes into the |
@@ -543,6 +601,8 @@ typedef union { | |||
543 | Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing | 601 | Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing |
544 | */ | 602 | */ |
545 | #define NV_PFIFO_SCHED_DISABLE 0x00002630 | 603 | #define NV_PFIFO_SCHED_DISABLE 0x00002630 |
604 | // Support: Ampere | ||
605 | #define NV_RUNLIST_SCHED_DISABLE 0x094 | ||
546 | typedef union { | 606 | typedef union { |
547 | struct { | 607 | struct { |
548 | bool runlist_0:1; | 608 | bool runlist_0:1; |
@@ -1018,7 +1078,7 @@ typedef union { | |||
1018 | struct { | 1078 | struct { |
1019 | uint32_t ptr:28; | 1079 | uint32_t ptr:28; |
1020 | enum INST_TARGET target:2; | 1080 | enum INST_TARGET target:2; |
1021 | uint32_t :1; | 1081 | uint32_t :1; // disable_cya_debug for BAR2 |
1022 | bool is_virtual:1; | 1082 | bool is_virtual:1; |
1023 | } __attribute__((packed)); | 1083 | } __attribute__((packed)); |
1024 | uint32_t raw; | 1084 | uint32_t raw; |
@@ -1091,6 +1151,9 @@ typedef union { | |||
1091 | Support: Tesla 2.0* through Ampere, Ada | 1151 | Support: Tesla 2.0* through Ampere, Ada |
1092 | *FAULT_REPLAY_* fields are Pascal+ only | 1152 | *FAULT_REPLAY_* fields are Pascal+ only |
1093 | See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc) | 1153 | See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc) |
1154 | |||
1155 | It appears that on Hopper, IS_VER2 continues to mean IS_VER2, but if unset, the | ||
1156 | alternative is VER3. | ||
1094 | */ | 1157 | */ |
1095 | #define NV_PRAMIN_PDB_CONFIG_OFF 0x200 | 1158 | #define NV_PRAMIN_PDB_CONFIG_OFF 0x200 |
1096 | typedef union { | 1159 | typedef union { |
@@ -1101,7 +1164,7 @@ typedef union { | |||
1101 | bool fault_replay_tex:1; | 1164 | bool fault_replay_tex:1; |
1102 | bool fault_replay_gcc:1; | 1165 | bool fault_replay_gcc:1; |
1103 | uint32_t :4; | 1166 | uint32_t :4; |
1104 | bool is_ver2:1; | 1167 | bool is_ver2:1; // XXX: Not on Hopper. May be set or not for same page_dir. |
1105 | bool is_64k_big_page:1; // 128 KiB otherwise | 1168 | bool is_64k_big_page:1; // 128 KiB otherwise
1106 | uint32_t page_dir_lo:20; | 1169 | uint32_t page_dir_lo:20; |
1107 | uint32_t page_dir_hi:32; | 1170 | uint32_t page_dir_hi:32; |
@@ -1421,6 +1484,182 @@ typedef union { | |||
1421 | } page_tbl_entry_v0_t; | 1484 | } page_tbl_entry_v0_t; |
1422 | */ | 1485 | */ |
1423 | 1486 | ||
1487 | /* Fifo Context RAM (RAMFC) and channel INstance RAM (RAMIN) | ||
1488 | |||
1489 | Each channel is configured with a 4 KiB instance block. The prefix of this | ||
1490 | block is referred to as RAMFC and stores channel-specific state for the Host | ||
1491 | (aka PFIFO). | ||
1492 | |||
1493 | "A GPU instance block is a block of memory that contains the state | ||
1494 | for a GPU context. A GPU context's instance block consists of Host state, | ||
1495 | pointers to each engine's state, and memory management state. A GPU instance | ||
1496 | block also contains a pointer to a block of memory that contains that part of a | ||
1497 | GPU context's state that a user-level driver may access. A GPU instance block | ||
1498 | fits within a single 4K-byte page of memory." | ||
1499 | |||
1500 | "The NV_RAMFC part of a GPU-instance block contains Host's part of a virtual | ||
1501 | GPU's state. Host is referred to as "FIFO". "FC" stands for FIFO Context. | ||
1502 | When Host switches from serving one GPU context to serving a second, Host saves | ||
1503 | state for the first GPU context to the first GPU context's RAMFC area, and loads | ||
1504 | state for the second GPU context from the second GPU context's RAMFC area." | ||
1505 | |||
1506 | "Every Host word entry in RAMFC directly corresponds to a PRI-accessible | ||
1507 | register. For a description of the contents of a RAMFC entry, please see the | ||
1508 | description of the corresponding register in "manuals/dev_pbdma.ref". The | ||
1509 | offsets of the fields within each entry in RAMFC match those of the | ||
1510 | corresponding register in the associated PBDMA unit's PRI space." | ||
1511 | |||
1512 | In summary, RAMFC includes details such as the head and tail of the pushbuffer, | ||
1513 | and RAMIN includes details such as the page table configuration(s). | ||
1514 | |||
1515 | The instance-global page table (as defined in the PDB field) is only used for | ||
1516 | GPU engines which do not support subcontexts (non-VEID engines). | ||
1517 | |||
1518 | **Not all documented fields are currently populated below.** | ||
1519 | |||
1520 | Support: *Kepler, *Maxwell, *Pascal, Volta, Turing, Ampere, [newer untested] | ||
1521 | *Pre-Volta GPUs do not support subcontexts. | ||
1522 | See also: dev_ram.ref.txt and dev_pbdma.ref.txt in NVIDIA's open-gpu-doc | ||
1523 | */ | ||
1524 | |||
1525 | // 16-byte (128-bit) substructure defining a subcontext configuration | ||
1526 | typedef struct { | ||
1527 | page_dir_config_t pdb; | ||
1528 | uint32_t pasid:20; // Process Address Space ID (PASID) used for ATS | ||
1529 | uint32_t :11; | ||
1530 | bool enable_ats:1; // Enable Address Translation Services (ATS)? | ||
1531 | uint32_t pad; | ||
1532 | } __attribute__((packed)) subcontext_ctrl_t; | ||
1533 | |||
1534 | typedef struct { | ||
1535 | // Start RAMFC (512 bytes) | ||
1536 | uint32_t pad[43]; | ||
1537 | uint32_t fc_target:5; // NV_RAMFC_TARGET; off 43 | ||
1538 | uint32_t :27; | ||
1539 | uint32_t pad2[17]; | ||
1540 | uint32_t fc_config_l2:1; // NV_RAMFC_CONFIG; off 61 | ||
1541 | uint32_t :3; | ||
1542 | uint32_t fc_config_ce_split:1; | ||
1543 | uint32_t fc_config_ce_no_throttle:1; | ||
1544 | uint32_t :2; | ||
1545 | uint32_t fc_config_is_priv:1; // ...AUTH_LEVEL | ||
1546 | uint32_t :3; | ||
1547 | uint32_t fc_config_userd_writeback:1; // ...USERD_WRITEBACK | ||
1548 | uint32_t :19; | ||
1549 | uint32_t pad3[1]; | ||
1550 | uint32_t fc_chan_info_scg:1; // ...SET_CHANNEL_INFO_SCG_TYPE | ||
1551 | uint32_t :7; | ||
1552 | uint32_t fc_chan_info_veid:6; // ...SET_CHANNEL_INFO_VEID | ||
1553 | uint32_t fc_chan_info_chid:12; // ...SET_CHANNEL_INFO_CHID | ||
1554 | uint32_t :6; | ||
1555 | uint32_t pad4[64]; | ||
1556 | // End RAMFC | ||
1557 | // Start RAMIN | ||
1558 | page_dir_config_t pdb; | ||
1559 | uint32_t pad5[2]; | ||
1560 | // WFI_TARGET appears to be ignored if WFI_IS_VIRTUAL | ||
1561 | uint32_t engine_wfi_target:2; // NV_RAMIN_ENGINE_WFI_TARGET; off 132 | ||
1562 | uint32_t engine_wfi_is_virtual:1; | ||
1563 | uint32_t :9; | ||
1564 | // WFI_PTR points to a CTXSW block (documented below) | ||
1565 | uint64_t engine_wfi_ptr:52; // NV_RAMIN_ENGINE_WFI_PTR_LO/_HI; off 132--133 | ||
1566 | uint32_t engine_wfi_veid:6; // NV_RAMIN_ENGINE_WFI_VEID; off 134; VEID == Subcontext ID | ||
1567 | uint32_t :26; | ||
1568 | uint32_t pasid:20; // NV_RAMIN_PASID; off 135; "Process Address Space ID" | ||
1569 | uint32_t :11; | ||
1570 | bool enable_ats:1; | ||
1571 | uint32_t pad6[30]; | ||
1572 | uint64_t subcontext_pdb_valid; // NV_RAMIN_SC_PDB_VALID; off 166-167 | ||
1573 | subcontext_ctrl_t subcontext[64]; // NV_RAMIN_SC_*; off 168-424 | ||
1574 | } __attribute__((packed)) instance_ctrl_t; | ||
1575 | |||
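A hedged sketch of walking this structure via `instance_deref()` (declared among the header additions below); the fields printed and the error-checking convention are assumptions, and the wrapper function is hypothetical:

```c
// Hypothetical example: print a channel's instance-global PDB, VEID, and
// which subcontexts have valid page directories.
static void example_dump_instance(struct nvdebug_state *g,
                                  uint64_t instance_addr,
                                  enum INST_TARGET instance_target) {
	instance_ctrl_t *inst = instance_deref(g, instance_addr, instance_target);
	if (IS_ERR_OR_NULL(inst)) // Error convention assumed
		return;
	printk(KERN_INFO "[nvdebug] PDB @ %#llx, VEID %d, CHID %d\n",
	       (u64)inst->pdb.page_dir << 12, inst->fc_chan_info_veid,
	       inst->fc_chan_info_chid);
	printk(KERN_INFO "[nvdebug] Subcontext PDB valid mask: %#018llx\n",
	       (u64)inst->subcontext_pdb_valid);
}
```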
1576 | // Context types | ||
1577 | enum CTXSW_TYPE { | ||
1578 | CTXSW_UNDEFINED = 0x0, | ||
1579 | CTXSW_OPENGL = 0x8, | ||
1580 | CTXSW_DX9 = 0x10, | ||
1581 | CTXSW_DX10 = 0x11, | ||
1582 | CTXSW_DX11 = 0x12, | ||
1583 | CTXSW_COMPUTE = 0x20, | ||
1584 | CTXSW_HEADER = 0x21 // A per-subcontext header | ||
1585 | }; | ||
1586 | static inline const char *ctxsw_type_to_text(enum CTXSW_TYPE t) { | ||
1587 | switch (t) { | ||
1588 | case CTXSW_UNDEFINED: | ||
1589 | return "[None]"; | ||
1590 | case CTXSW_OPENGL: | ||
1591 | return "OpenGL"; | ||
1592 | case CTXSW_DX9: | ||
1593 | case CTXSW_DX10: | ||
1594 | case CTXSW_DX11: | ||
1595 | return "DirectX"; | ||
1596 | case CTXSW_COMPUTE: | ||
1597 | return "Compute"; | ||
1598 | case CTXSW_HEADER: | ||
1599 | return "Header"; | ||
1600 | default: | ||
1601 | return "UNKNOWN"; | ||
1602 | } | ||
1603 | } | ||
1604 | |||
1605 | // Preemption modes: | ||
1606 | // WFI: Wait For Idle (preempt on idle) | ||
1607 | // CTA: Cooperative Thread Array-level Preemption (preempt at end of block) | ||
1608 | // CILP: Compute-Instruction-Level Preemption (preempt at end of instruction) | ||
1609 | enum GRAPHICS_PREEMPT_TYPE {PREEMPT_WFI, PREEMPT_GFXP}; | ||
1610 | enum COMPUTE_PREEMPT_TYPE {_PREEMPT_WFI, PREEMPT_CTA, PREEMPT_CILP}; | ||
1611 | static inline const char *compute_preempt_type_to_text(enum COMPUTE_PREEMPT_TYPE t) { | ||
1612 | switch (t) { | ||
1613 | case _PREEMPT_WFI: | ||
1614 | return "WFI"; | ||
1615 | case PREEMPT_CTA: | ||
1616 | return "CTA"; | ||
1617 | case PREEMPT_CILP: | ||
1618 | return "CILP"; | ||
1619 | default: | ||
1620 | return "INVALID"; | ||
1621 | } | ||
1622 | } | ||
1623 | static inline const char *graphics_preempt_type_to_text(enum GRAPHICS_PREEMPT_TYPE t) { | ||
1624 | switch (t) { | ||
1625 | case PREEMPT_WFI: | ||
1626 | return "WFI"; | ||
1627 | case PREEMPT_GFXP: | ||
1628 | return "GFXP"; | ||
1629 | default: | ||
1630 | return "INVALID"; | ||
1631 | } | ||
1632 | } | ||
1633 | |||
1634 | /* ConTeXt SWitch control block (CTXSW) | ||
1635 | Support: Maxwell*, Pascal**, Volta, Turing, Ampere, Ada | ||
1636 | *Nothing except for CONTEXT_ID and TYPE | ||
1637 | **Except as noted | ||
1638 | See also: manuals/volta/gv100/dev_ctxsw.ref.txt in open-gpu-doc | ||
1639 | and hw_ctxsw_prog_*.h in nvgpu | ||
1640 | */ | ||
1641 | // (Note that this layout changes somewhat from generation to generation) | ||
1642 | typedef struct context_switch_block { | ||
1643 | uint32_t pad[3]; | ||
1644 | enum CTXSW_TYPE type:6; // Unused except when type CTXSW_HEADER? | ||
1645 | uint32_t :26; | ||
1646 | uint32_t pad2[26]; | ||
1647 | // The context buffer ptr fields are in an opposite-of-typical order, so we | ||
1648 | // can't merge them into a single context_buffer_ptr field. | ||
1649 | uint32_t context_buffer_ptr_hi; // Volta+ only | ||
1650 | uint32_t context_buffer_ptr_lo; // Volta+ only | ||
1651 | enum GRAPHICS_PREEMPT_TYPE graphics_preemption_options:32; | ||
1652 | enum COMPUTE_PREEMPT_TYPE compute_preemption_options:32; | ||
1653 | uint32_t pad3[18]; | ||
1654 | uint32_t num_wfi_save_operations; | ||
1655 | uint32_t num_cta_save_operations; | ||
1656 | uint32_t num_gfxp_save_operations; | ||
1657 | uint32_t num_cilp_save_operations; | ||
1658 | uint32_t pad4[4]; | ||
1659 | uint32_t context_id; | ||
1660 | // [There are more fields not yet added here.] | ||
1661 | } __attribute__((packed)) context_switch_ctrl_t; | ||
1662 | |||
1424 | /* VRAM Information | 1663 | /* VRAM Information |
1425 | 1664 | ||
1426 | If ECC is disabled: | 1665 | If ECC is disabled: |
@@ -1452,6 +1691,12 @@ static inline uint64_t memory_range_to_bytes(memory_range_t range) { | |||
1452 | 1691 | ||
1453 | /* Begin nvdebug types and functions */ | 1692 | /* Begin nvdebug types and functions */ |
1454 | 1693 | ||
1694 | // __iomem is only defined when building as a kernel module, so conditionally | ||
1695 | // define it to allow including this header outside the kernel. | ||
1696 | #ifndef __iomem | ||
1697 | #define __iomem | ||
1698 | #endif | ||
1699 | |||
1455 | // Vendor ID for PCI devices manufactured by NVIDIA | 1700 | // Vendor ID for PCI devices manufactured by NVIDIA |
1456 | #define NV_PCI_VENDOR 0x10de | 1701 | #define NV_PCI_VENDOR 0x10de |
1457 | struct nvdebug_state { | 1702 | struct nvdebug_state { |
@@ -1474,6 +1719,10 @@ struct nvdebug_state { | |||
1474 | struct platform_device *platd; | 1719 | struct platform_device *platd; |
1475 | // Pointer to generic device struct (both platform and pcie devices) | 1720 | // Pointer to generic device struct (both platform and pcie devices) |
1476 | struct device *dev; | 1721 | struct device *dev; |
1722 | #ifdef __KERNEL__ | ||
1723 | // List used by mmu.c to track allocated pages for page directories/tables | ||
1724 | struct list_head pd_allocs; | ||
1725 | #endif | ||
1477 | }; | 1726 | }; |
1478 | 1727 | ||
1479 | // This disgusting macro is a crutch to work around the fact that runlists were | 1728 | // This disgusting macro is a crutch to work around the fact that runlists were |
@@ -1542,7 +1791,19 @@ int get_runlist_iter( | |||
1542 | struct runlist_iter *rl_iter /* out */); | 1791 | struct runlist_iter *rl_iter /* out */); |
1543 | int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id); | 1792 | int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id); |
1544 | int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id); | 1793 | int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id); |
1545 | int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id); | 1794 | int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off); |
1795 | instance_ctrl_t *instance_deref( | ||
1796 | struct nvdebug_state *g, | ||
1797 | uint64_t instance_addr, | ||
1798 | enum INST_TARGET instance_target); | ||
1799 | context_switch_ctrl_t *get_ctxsw( | ||
1800 | struct nvdebug_state *g, | ||
1801 | instance_ctrl_t *inst); | ||
1802 | int set_channel_preemption_mode( | ||
1803 | struct nvdebug_state *g, | ||
1804 | uint32_t chan_id, | ||
1805 | uint32_t rl_id, | ||
1806 | enum COMPUTE_PREEMPT_TYPE mode); | ||
1546 | 1807 | ||
1547 | // Defined in mmu.c | 1808 | // Defined in mmu.c |
1548 | uint64_t search_page_directory( | 1809 | uint64_t search_page_directory( |
@@ -1550,11 +1811,33 @@ uint64_t search_page_directory( | |||
1550 | page_dir_config_t pd_config, | 1811 | page_dir_config_t pd_config, |
1551 | uint64_t addr_to_find, | 1812 | uint64_t addr_to_find, |
1552 | enum INST_TARGET addr_to_find_aperture); | 1813 | enum INST_TARGET addr_to_find_aperture); |
1814 | int translate_page_directory( | ||
1815 | struct nvdebug_state *g, | ||
1816 | page_dir_config_t pd_config, | ||
1817 | uint64_t addr_to_find, | ||
1818 | uint64_t *found_addr /* out */, | ||
1819 | enum INST_TARGET *found_aperture /* out */); | ||
1820 | int map_page_directory( | ||
1821 | struct nvdebug_state *g, | ||
1822 | page_dir_config_t pd_config, | ||
1823 | uint64_t paddr_to_map, | ||
1824 | uint64_t vaddr_to_find, | ||
1825 | enum INST_TARGET paddr_target, | ||
1826 | bool huge_page); | ||
1827 | int gc_page_directory( | ||
1828 | struct nvdebug_state *g, | ||
1829 | bool force); | ||
1553 | uint64_t search_v1_page_directory( | 1830 | uint64_t search_v1_page_directory( |
1554 | struct nvdebug_state *g, | 1831 | struct nvdebug_state *g, |
1555 | page_dir_config_t pd_config, | 1832 | page_dir_config_t pd_config, |
1556 | uint64_t addr_to_find, | 1833 | uint64_t addr_to_find, |
1557 | enum INST_TARGET addr_to_find_aperture); | 1834 | enum INST_TARGET addr_to_find_aperture); |
1835 | int translate_v1_page_directory( | ||
1836 | struct nvdebug_state *g, | ||
1837 | page_dir_config_t pd_config, | ||
1838 | uint64_t addr_to_find, | ||
1839 | uint64_t *found_addr /* out */, | ||
1840 | enum INST_TARGET *found_aperture /* out */); | ||
1558 | // Defined in bus.c | 1841 | // Defined in bus.c |
1559 | int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); | 1842 | int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); |
1560 | int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */); | 1843 | int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */); |
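A hedged sketch of how the page-directory helpers declared above divide the work, inferred from their use elsewhere in this commit (the demo function and addresses are placeholders): search_page_directory() walks the tables in reverse, from a physical address and aperture to the GPU virtual address that maps it, while translate_page_directory() walks forward, from a GPU virtual address to the backing physical address and aperture.

    // Sketch only: illustrative round trip through the BAR2/3 page tables.
    static int demo_pd_lookups(struct nvdebug_state *g)
    {
        page_dir_config_t pd_config;
        uint64_t virt, phys;
        enum INST_TARGET aperture;
        int err;
        if ((err = get_bar2_pdb(g, &pd_config)) < 0)
            return err;
        if (!pd_config.is_ver2)
            return -EOPNOTSUPP; // the *_v1_* variants cover older table formats
        // Reverse: which BAR2/3 offset (if any) maps this VID_MEM page?
        if (!(virt = search_page_directory(g, pd_config, 0x1000000, TARGET_VID_MEM)))
            return -ENOENT;
        // Forward: which physical page and aperture back that BAR2/3 offset?
        return translate_page_directory(g, pd_config, virt, &phys, &aperture);
    }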
diff --git a/nvdebug_entry.c b/nvdebug_entry.c index 3a10e13..c0cfa63 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c | |||
@@ -15,7 +15,7 @@ | |||
15 | 15 | ||
16 | // Enable to intercept and log GPU interrupts. Historically used to benchmark | 16 | // Enable to intercept and log GPU interrupts. Historically used to benchmark |
17 | // interrupt latency. | 17 | // interrupt latency. |
18 | #define INTERRUPT_DEBUG 0 | 18 | #define INTERRUPT_DEBUG |
19 | 19 | ||
20 | // MIT is GPL-compatible. We need to be GPL-compatible for symbols like | 20 | // MIT is GPL-compatible. We need to be GPL-compatible for symbols like |
21 | // platform_bus_type or bus_find_device_by_name... | 21 | // platform_bus_type or bus_find_device_by_name... |
@@ -28,12 +28,20 @@ extern struct file_operations runlist_file_ops; | |||
28 | extern struct file_operations preempt_tsg_file_ops; | 28 | extern struct file_operations preempt_tsg_file_ops; |
29 | extern struct file_operations disable_channel_file_ops; | 29 | extern struct file_operations disable_channel_file_ops; |
30 | extern struct file_operations enable_channel_file_ops; | 30 | extern struct file_operations enable_channel_file_ops; |
31 | extern struct file_operations wfi_preempt_channel_file_ops; | ||
32 | extern struct file_operations cta_preempt_channel_file_ops; | ||
33 | extern struct file_operations cil_preempt_channel_file_ops; | ||
31 | extern struct file_operations resubmit_runlist_file_ops; | 34 | extern struct file_operations resubmit_runlist_file_ops; |
35 | extern struct file_operations preempt_runlist_file_ops; | ||
36 | extern struct file_operations ack_bad_tsg_file_ops; | ||
37 | extern struct file_operations map_mem_chid_file_ops; | ||
38 | extern struct file_operations map_mem_ctxid_file_ops; | ||
32 | extern struct file_operations switch_to_tsg_file_ops; | 39 | extern struct file_operations switch_to_tsg_file_ops; |
33 | // device_info_procfs.c | 40 | // device_info_procfs.c |
34 | extern struct file_operations device_info_file_ops; | 41 | extern struct file_operations device_info_file_ops; |
35 | extern struct file_operations nvdebug_read_reg32_file_ops; | 42 | extern struct file_operations nvdebug_read_reg32_file_ops; |
36 | extern struct file_operations nvdebug_read_reg_range_file_ops; | 43 | extern struct file_operations nvdebug_read_reg_range_file_ops; |
44 | extern struct file_operations nvdebug_read_part_file_ops; | ||
37 | extern struct file_operations local_memory_file_ops; | 45 | extern struct file_operations local_memory_file_ops; |
38 | // copy_topology_procfs.c | 46 | // copy_topology_procfs.c |
39 | extern struct file_operations copy_topology_file_ops; | 47 | extern struct file_operations copy_topology_file_ops; |
@@ -71,9 +79,271 @@ const struct file_operations* compat_ops(const struct file_operations* ops) { | |||
71 | } | 79 | } |
72 | #endif | 80 | #endif |
73 | 81 | ||
74 | #if INTERRUPT_DEBUG | 82 | #ifdef INTERRUPT_DEBUG |
83 | |||
84 | void nvdebug_fifo_intr(struct nvdebug_state *g) { | ||
85 | uint32_t fifo_intr_mask; | ||
86 | fifo_intr_mask = nvdebug_readl(g, 0x02100); // PFIFO_INTR_0 | ||
87 | if (fifo_intr_mask & 1 << 0) | ||
88 | printk(KERN_INFO "[nvdebug] - Interrupt BIND_ERROR.\n"); | ||
89 | if (fifo_intr_mask & 1 << 1) | ||
90 | printk(KERN_INFO "[nvdebug] - Interrupt CTXSW_TIMEOUT.\n"); | ||
91 | if (fifo_intr_mask & 1 << 4) | ||
92 | printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_IDLE.\n"); | ||
93 | if (fifo_intr_mask & 1 << 5) | ||
94 | printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_AND_ENG_IDLE.\n"); | ||
95 | if (fifo_intr_mask & 1 << 6) | ||
96 | printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE.\n"); | ||
97 | if (fifo_intr_mask & 1 << 7) | ||
98 | printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE_AND_ENG_IDLE.\n"); | ||
99 | if (fifo_intr_mask & 1 << 8) | ||
100 | printk(KERN_INFO "[nvdebug] - Interrupt SCHED_ERROR.\n"); | ||
101 | if (fifo_intr_mask & 1 << 16) | ||
102 | printk(KERN_INFO "[nvdebug] - Interrupt CHSW_ERROR.\n"); | ||
103 | if (fifo_intr_mask & 1 << 23) | ||
104 | printk(KERN_INFO "[nvdebug] - Interrupt MEMOP_TIMEOUT.\n"); | ||
105 | if (fifo_intr_mask & 1 << 24) | ||
106 | printk(KERN_INFO "[nvdebug] - Interrupt LB_ERROR.\n"); | ||
107 | if (fifo_intr_mask & 1 << 25) // OLD; Pascal | ||
108 | printk(KERN_INFO "[nvdebug] - Interrupt REPLAYABLE_FAULT_ERROR.\n"); | ||
109 | if (fifo_intr_mask & 1 << 27) // OLD; Pascal | ||
110 | printk(KERN_INFO "[nvdebug] - Interrupt DROPPED_MMU_FAULT.\n"); | ||
111 | if (fifo_intr_mask & 1 << 28) { // On Pascal, this is MMU_FAULT | ||
112 | if (g->chip_id <= NV_CHIP_ID_VOLTA) // MMU_FAULT on Pascal (nvgpu, l4t/l4t-r28.1:drivers/gpu/nvgpu/include/nvgpu/hw/gp10b/hw_fifo_gp10b.h) | ||
113 | printk(KERN_INFO "[nvdebug] - Interrupt MMU_FAULT.\n"); | ||
114 | else // Repurposed starting with Turing: open-gpu-doc/manuals/turing/tu104/dev_fifo.ref.txt | ||
115 | printk(KERN_INFO "[nvdebug] - Interrupt TSG_PREEMPT_COMPLETE.\n"); | ||
116 | } | ||
117 | if (fifo_intr_mask & 1 << 29) | ||
118 | printk(KERN_INFO "[nvdebug] - Interrupt PBDMA_INTR.\n"); | ||
119 | if (fifo_intr_mask & 1 << 30) { | ||
120 | printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_EVENT.\n"); | ||
121 | uint32_t fifo_runlist_intr_mask = nvdebug_readl(g, 0x02A00); // PFIFO_INTR_RUNLIST | ||
122 | printk(KERN_INFO "[nvdebug] - Event %#x.\n", fifo_runlist_intr_mask); | ||
123 | } | ||
124 | if (fifo_intr_mask & 1 << 31) | ||
125 | printk(KERN_INFO "[nvdebug] - Interrupt CHANNEL_INTR.\n"); | ||
126 | } | ||
127 | |||
75 | irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { | 128 | irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { |
76 | printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num); | 129 | struct nvdebug_state *g = dev; |
130 | u64 time = ktime_get_raw_ns(); // CLOCK_MONOTONTIC_RAW | ||
131 | // NV_PMC_INTR does not exist on Ada, so use NV_FUNC_PRIV_CPU_INTR_TOP | ||
132 | // Note that this also appears to exist on Turing | ||
133 | if (g->chip_id >= NV_CHIP_ID_TURING) { // (previously NV_CHIP_ID_AMPERE) | ||
134 | int i; | ||
135 | // Despite being an indexed register, it is only documented to have one, and could only support two | ||
136 | uint32_t intr_mask0 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1600); // NV_FUNC_PRIV_CPU_INTR_TOP(0) | ||
137 | uint32_t intr_mask1 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1604); // NV_FUNC_PRIV_CPU_INTR_TOP(1) | ||
138 | printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with CPU_INTR_TOP(0) %#010x, ...(1) %#010x @ %llu.\n", irq_num, intr_mask0, intr_mask1, time); | ||
139 | for (i = 0; i < 8; i++) { | ||
140 | uint32_t leaf = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1000 + i*4); // NV_FUNC_PRIV_CPU_INTR_LEAF(0) to ...(7) | ||
141 | if (leaf) | ||
142 | printk(KERN_INFO "[nvdebug] - Interrupt leaf %d: %#010x\n", i, leaf); | ||
143 | // 131-133 & 64 are faults on tu104??? (open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt) | ||
144 | if (136 / 32 == i && 1 << (136 % 32) & leaf) // PFIFO0 ga100 | ||
145 | printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO0.\n"); | ||
146 | if (137 / 32 == i && 1 << (137 % 32) & leaf) // PFIFO1 ga100 | ||
147 | printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO1.\n"); | ||
148 | if (148 / 32 == i && 1 << (148 % 32) & leaf) // TIMER ga100 | ||
149 | printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n"); | ||
150 | if (152 / 32 == i && 1 << (152 % 32) & leaf) // PMU ga100 | ||
151 | printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n"); | ||
152 | if (156 / 32 == i && 1 << (156 % 32) & leaf) { // PBUS ga100 | ||
153 | printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n"); | ||
154 | uint32_t bus_intr = nvdebug_readl(g, 0x1100); // BUS_INTR_0 | ||
155 | if (bus_intr & 1 << 2) { | ||
156 | // use timer_pri_timeout_save_0_r | ||
157 | uint32_t SAVE_0 = nvdebug_readl(g, 0x00009084); // NV_PTIMER_PRI_TIMEOUT_SAVE_0 | ||
158 | printk(KERN_INFO "[nvdebug] - Interrupt PRI_FECSERR on %s to address %#010x %stargeting FECS.\n", SAVE_0 & 0x2 ? "write" : "read", SAVE_0 & 0x00fffffc, SAVE_0 & 0x80000000 ? "" : "not "); | ||
159 | uint32_t SAVE_1 = nvdebug_readl(g, 0x00009088); // NV_PTIMER_PRI_TIMEOUT_SAVE_1 | ||
160 | if (SAVE_1) | ||
161 | printk(KERN_INFO "[nvdebug] Data written: %#010x\n", SAVE_1); | ||
162 | uint32_t errcode = readl(g->regs + 0x0000908C); // NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE | ||
163 | if (errcode) | ||
164 | printk(KERN_INFO "[nvdebug] FECS Error Code: %#010x\n", errcode); | ||
165 | // badf5040 is a "client error" (0) of "no such address" (40) | ||
166 | // See linux-nvgpu/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c | ||
167 | // for how to decode. | ||
168 | } | ||
169 | if (bus_intr & 1 << 3) | ||
170 | printk(KERN_INFO "[nvdebug] - Interrupt PRI_TIMEOUT.\n"); | ||
171 | if (bus_intr & 1 << 4) | ||
172 | printk(KERN_INFO "[nvdebug] - Interrupt FB_REQ_TIMEOUT.\n"); | ||
173 | if (bus_intr & 1 << 5) | ||
174 | printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_TIMEOUT.\n"); | ||
175 | if (bus_intr & 1 << 6) | ||
176 | printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_EXTRA.\n"); | ||
177 | if (bus_intr & 1 << 7) | ||
178 | printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_TIMEOUT.\n"); | ||
179 | if (bus_intr & 1 << 8) | ||
180 | printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_EXTRA.\n"); | ||
181 | if (bus_intr & 1 << 26) | ||
182 | printk(KERN_INFO "[nvdebug] - Interrupt SW.\n"); | ||
183 | if (bus_intr & 1 << 27) | ||
184 | printk(KERN_INFO "[nvdebug] - Interrupt POSTED_DEADLOCK_TIMEOUT.\n"); | ||
185 | if (bus_intr & 1 << 28) | ||
186 | printk(KERN_INFO "[nvdebug] - Interrupt MPMU.\n"); | ||
187 | if (bus_intr & 1 << 31) | ||
188 | printk(KERN_INFO "[nvdebug] - Interrupt ACCESS_TIMEOUT.\n"); | ||
189 | } | ||
190 | if (158 / 32 == i && 1 << (158 % 32) & leaf) // PRIV_RING ga100 | ||
191 | printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n"); | ||
192 | if (192 / 32 == i && 1 << (192 % 32) & leaf) // LEGACY_ENGINE_STALL ga100 | ||
193 | printk(KERN_INFO "[nvdebug] - Interrupt on LEGACY_ENGINE_STALL.\n"); | ||
194 | if (160 / 32 == i && 1 << (160 % 32) & leaf) { // (likely) rl0 ga100 | ||
195 | printk(KERN_INFO "[nvdebug] - Interrupt on RUNLIST0.\n"); | ||
196 | uint32_t off; | ||
197 | get_runlist_ram(g, 0, &off); | ||
198 | uint32_t rl_intr = nvdebug_readl(g, off+0x100); | ||
199 | printk(KERN_INFO "[nvdebug] - RUNLIST_INTR_0: %#x\n", rl_intr); | ||
200 | if (rl_intr & 1 << 12) { // BAD_TSG | ||
201 | printk(KERN_INFO "[nvdebug] - BAD_TSG: %#x\n", nvdebug_readl(g, off+0x174)); | ||
202 | } | ||
203 | } | ||
204 | // Also getting 160, 161, and 162 | ||
205 | |||
206 | //uint32_t off; | ||
207 | //get_runlist_ram(g, 12, &off); | ||
208 | //printk(KERN_INFO "[nvdebug] - rl10 vector id 0 is %x\n", nvdebug_readl(g, off+0x160)); // NV_RUNLIST_INTR_VECTORID(0) | ||
209 | // 160 is rl0 (C/G, LCE0, LCE1) vector id 0 | ||
210 | // 168 is rl11 (LCE3) vector id 0 | ||
211 | // 169 is rl12 (LCE4) vector id 0 | ||
212 | // 171 is rl1 (SEC) vector id 0 | ||
213 | // 176 is rl10 (LCE2) vector id 0 | ||
214 | // 224 is rl0 vector id 1 | ||
215 | // Only some interrupt vectors are hardcoded | ||
216 | } | ||
217 | // each subtree has two leafs? Each bit at the top corresponds to a subtree? | ||
218 | // So, if bit 0 is set, that means subtree 0 (concept) and leaves 0 and 1 | ||
219 | // So, if bit 1 is set, that means subtree 1 (concept) and leaves 2 and 3 | ||
220 | // the #define'd interrupt vectors all seem to fall in the lower leaf of subtree 2, | ||
221 | // except INTR_HUB_ACCESS_CNTR_INTR_VECTOR, which is in the lower leaf of subtree 1 | ||
222 | if (g->chip_id >= NV_CHIP_ID_AMPERE) | ||
223 | return IRQ_NONE; | ||
224 | } | ||
225 | uint32_t intr_mask = nvdebug_readl(g, 0x0100); // NV_PMC_INTR | ||
226 | printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with MC_INTR %#010x @ %llu.\n", irq_num, intr_mask, time); | ||
227 | // IDs likely changed Ampere+ | ||
228 | //if (g->chip_id >= NV_CHIP_ID_AMPERE) { | ||
229 | // CIC is central interrupt controller | ||
230 | // the u32 passed around nvgpu cic functions is one of the | ||
231 | // enable is nvgpu_cic_mon_intr_stall_unit_config(unit) | ||
232 | // - Calls intr_stall_unit_config(unit) | ||
233 | // - for ga, calls unit = ga10b_intr_map_mc_stall_unit_to_intr_unit(unit) (doesn't do much) | ||
234 | // - for ga, calls nvgpu_cic_mon_intr_get_unit_info() | ||
235 | // - Does *subtree = g->mc.intr_unit_info[unit].subtree; | ||
236 | // *subtree_mask = g->mc.intr_unit_info[unit].subtree_mask; | ||
237 | // - for ga, calls ga10b_intr_config() w/ subtree info | ||
238 | //uint32_t intr_stats = nvdebug_readl(g, 1600 | ||
239 | //return IRQ_NONE; | ||
240 | //} | ||
241 | if (intr_mask & 1 << 5) | ||
242 | printk(KERN_INFO "[nvdebug] - Interrupt on LCE0.\n"); | ||
243 | if (intr_mask & 1 << 6) | ||
244 | printk(KERN_INFO "[nvdebug] - Interrupt on LCE1.\n"); | ||
245 | if (intr_mask & 1 << 7) | ||
246 | printk(KERN_INFO "[nvdebug] - Interrupt on LCE2.\n"); | ||
247 | if (intr_mask & 1 << 8) { | ||
248 | printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO.\n"); | ||
249 | nvdebug_fifo_intr(g); | ||
250 | } | ||
251 | if (intr_mask & 1 << 9) { | ||
252 | printk(KERN_INFO "[nvdebug] - Interrupt on HUB.\n"); // "replayable_fault_pending" in nvgpu on Pascal, "HUB" on Volta+ | ||
253 | // on tu104, if vector is one of the below set in new-style interrupt vector, then MMU fault | ||
254 | // - info_fault (134) | ||
255 | // - nonreplay_fault error (133) | ||
256 | // - nonreplay_fault notify (132) | ||
257 | // - replay_fault error (131) | ||
258 | // - replay_fault notify (64) | ||
259 | // (but the above fault vectors are configurable) | ||
260 | // if it's ecc_error, then not mmu error | ||
261 | // Default fault vectors from open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt | ||
262 | // Turing through (at least) Ampere (per nvgpu) | ||
263 | |||
264 | // on gv100, parse fb_niso_intr_r 0x00100a20U, where bits: | ||
265 | // - hub_access_counter notify (0) | ||
266 | // - hub_access_counter error (1) | ||
267 | // - replay_fault notify (27) | ||
268 | // - replay_fault overflow (28) | ||
269 | // - nonreplay_fault notify (29) | ||
270 | // - nonreplay_fault overflow (30) | ||
271 | // - other_fault notify (31) | ||
272 | // Volta through Turing (per nvgpu) | ||
273 | |||
274 | // On Pascal, it looks like it's a property of fifo_intr_0??? | ||
275 | if (g->chip_id < NV_CHIP_ID_VOLTA) | ||
276 | nvdebug_fifo_intr(g); | ||
277 | } | ||
278 | if (intr_mask & 1 << 10) | ||
279 | printk(KERN_INFO "[nvdebug] - Interrupt on LCE3.\n"); | ||
280 | if (intr_mask & 1 << 11) | ||
281 | printk(KERN_INFO "[nvdebug] - Interrupt on LCE4.\n"); | ||
282 | if (intr_mask & 1 << 12) { | ||
283 | printk(KERN_INFO "[nvdebug] - Interrupt on Graphics/Compute.\n"); | ||
284 | // Kepler through (at least) Ampere | ||
285 | // From open-gpu-doc/manuals/volta/gv100/dev_graphics.ref.txt | ||
286 | uint32_t graph_intr_mask = nvdebug_readl(g, 0x400100); // NV_PGRAPH_INTR | ||
287 | if (graph_intr_mask & 1 << 0) | ||
288 | printk(KERN_INFO "[nvdebug] - Interrupt NOTIFY.\n"); | ||
289 | if (graph_intr_mask & 1 << 1) | ||
290 | printk(KERN_INFO "[nvdebug] - Interrupt SEMAPHORE.\n"); | ||
291 | if (graph_intr_mask & 1 << 4) | ||
292 | printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_METHOD.\n"); | ||
293 | if (graph_intr_mask & 1 << 5) | ||
294 | printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_CLASS.\n"); | ||
295 | if (graph_intr_mask & 1 << 6) | ||
296 | printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_NOTIFY.\n"); | ||
297 | if (graph_intr_mask & 1 << 7) | ||
298 | printk(KERN_INFO "[nvdebug] - Interrupt DEBUG_METHOD.\n"); | ||
299 | if (graph_intr_mask & 1 << 8) | ||
300 | printk(KERN_INFO "[nvdebug] - Interrupt FIRMWARE_METHOD.\n"); | ||
301 | if (graph_intr_mask & 1 << 16) | ||
302 | printk(KERN_INFO "[nvdebug] - Interrupt BUFFER_NOTIFY.\n"); | ||
303 | if (graph_intr_mask & 1 << 19) | ||
304 | printk(KERN_INFO "[nvdebug] - Interrupt FECS_ERROR.\n"); | ||
305 | if (graph_intr_mask & 1 << 20) | ||
306 | printk(KERN_INFO "[nvdebug] - Interrupt CLASS_ERROR.\n"); | ||
307 | if (graph_intr_mask & 1 << 21) | ||
308 | printk(KERN_INFO "[nvdebug] - Interrupt EXCEPTION.\n"); | ||
309 | } | ||
310 | if (intr_mask & 1 << 13) | ||
311 | printk(KERN_INFO "[nvdebug] - Interrupt on PFB.\n"); | ||
312 | if (intr_mask & 1 << 15) | ||
313 | printk(KERN_INFO "[nvdebug] - Interrupt on SEC.\n"); | ||
314 | if (intr_mask & 1 << 16) | ||
315 | printk(KERN_INFO "[nvdebug] - Interrupt on NVENC0.\n"); | ||
316 | if (intr_mask & 1 << 17) | ||
317 | printk(KERN_INFO "[nvdebug] - Interrupt on NVDEC0.\n"); | ||
318 | if (intr_mask & 1 << 18) | ||
319 | printk(KERN_INFO "[nvdebug] - Interrupt on THERMAL.\n"); | ||
320 | if (intr_mask & 1 << 19) | ||
321 | printk(KERN_INFO "[nvdebug] - Interrupt on HDACODEC.\n"); | ||
322 | if (intr_mask & 1 << 20) | ||
323 | printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n"); | ||
324 | if (intr_mask & 1 << 21) | ||
325 | printk(KERN_INFO "[nvdebug] - Interrupt on PMGR.\n"); | ||
326 | if (intr_mask & 1 << 22) | ||
327 | printk(KERN_INFO "[nvdebug] - Interrupt on IOCTRL.\n"); | ||
328 | if (intr_mask & 1 << 23) | ||
329 | printk(KERN_INFO "[nvdebug] - Interrupt on DFD.\n"); | ||
330 | if (intr_mask & 1 << 24) | ||
331 | printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n"); | ||
332 | if (intr_mask & 1 << 25) | ||
333 | printk(KERN_INFO "[nvdebug] - Interrupt on LTC.\n"); | ||
334 | if (intr_mask & 1 << 26) | ||
335 | printk(KERN_INFO "[nvdebug] - Interrupt on PDISP.\n"); | ||
336 | if (intr_mask & 1 << 27) | ||
337 | printk(KERN_INFO "[nvdebug] - Interrupt on GSP.\n"); | ||
338 | if (intr_mask & 1 << 28) | ||
339 | printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n"); | ||
340 | if (intr_mask & 1 << 29) | ||
341 | printk(KERN_INFO "[nvdebug] - Interrupt on XVE.\n"); | ||
342 | if (intr_mask & 1 << 30) | ||
343 | printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n"); | ||
344 | if (intr_mask & 1 << 31) | ||
345 | printk(KERN_INFO "[nvdebug] - Interrupt on SOFTWARE.\n"); | ||
346 | |||
77 | return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. | 347 | return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. |
78 | } | 348 | } |
79 | #endif // INTERRUPT_DEBUG | 349 | #endif // INTERRUPT_DEBUG |
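A small worked example of the vector decoding used in the Turing+ branch above (a sketch, not part of the commit): a vector V is reported in CPU_INTR_LEAF(V / 32) at bit V % 32, which is what the `136 / 32 == i && 1 << (136 % 32) & leaf`-style tests compute; e.g., vector 136 (PFIFO0 on GA100) lands in leaf 4 under bit mask 0x100.

    // Sketch: does interrupt vector `vector` appear in leaf register `leaf_idx`?
    static inline bool vector_pending(uint32_t leaf_value, int leaf_idx, int vector)
    {
        return (vector / 32 == leaf_idx) && (leaf_value & (1u << (vector % 32)));
    }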
@@ -135,6 +405,7 @@ int probe_and_cache_devices(void) { | |||
135 | g_nvdebug_state[i].pcid = NULL; | 405 | g_nvdebug_state[i].pcid = NULL; |
136 | g_nvdebug_state[i].platd = platd; | 406 | g_nvdebug_state[i].platd = platd; |
137 | g_nvdebug_state[i].dev = dev; | 407 | g_nvdebug_state[i].dev = dev; |
408 | INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs); | ||
138 | // Don't check Chip ID until everything else is initalized | 409 | // Don't check Chip ID until everything else is initalized |
139 | ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); | 410 | ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); |
140 | if (ids.raw == -1) { | 411 | if (ids.raw == -1) { |
@@ -152,6 +423,11 @@ int probe_and_cache_devices(void) { | |||
152 | mc_boot_0_t ids; | 423 | mc_boot_0_t ids; |
153 | g_nvdebug_state[i].g = NULL; | 424 | g_nvdebug_state[i].g = NULL; |
154 | // Map BAR0 (GPU control registers) | 425 | // Map BAR0 (GPU control registers) |
426 | // XXX: Don't use pci_iomap. This adds support for I/O registers, but we do | ||
427 | // not use the required ioread/write functions for those regions. We | ||
428 | // should use pci_ioremap_bar, which is explicitly for MMIO regions. | ||
429 | // pci_ioremap_bar -> ioremap_nocache (all platforms) | ||
430 | // pci_iomap -> ioremap_nocache (on x86) | ||
155 | g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); | 431 | g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); |
156 | if (!g_nvdebug_state[i].regs) { | 432 | if (!g_nvdebug_state[i].regs) { |
157 | pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); | 433 | pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); |
@@ -163,9 +439,14 @@ int probe_and_cache_devices(void) { | |||
163 | // (vesafb may map the top half for display) | 439 | // (vesafb may map the top half for display) |
164 | if (!g_nvdebug_state[i].bar3) | 440 | if (!g_nvdebug_state[i].bar3) |
165 | g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); | 441 | g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); |
442 | // Observed on H100: what is normally BAR3 was moved to BAR4, and BAR1 | ||
443 | // was moved to BAR2. | ||
444 | if (!g_nvdebug_state[i].bar3) | ||
445 | g_nvdebug_state[i].bar3 = pci_iomap(pcid, 4, 0); | ||
166 | g_nvdebug_state[i].pcid = pcid; | 446 | g_nvdebug_state[i].pcid = pcid; |
167 | g_nvdebug_state[i].platd = NULL; | 447 | g_nvdebug_state[i].platd = NULL; |
168 | g_nvdebug_state[i].dev = &pcid->dev; | 448 | g_nvdebug_state[i].dev = &pcid->dev; |
449 | INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs); | ||
169 | // Don't check Chip ID until everything else is initalized | 450 | // Don't check Chip ID until everything else is initalized |
170 | ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); | 451 | ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); |
171 | if (ids.raw == -1) { | 452 | if (ids.raw == -1) { |
@@ -175,9 +456,17 @@ int probe_and_cache_devices(void) { | |||
175 | g_nvdebug_state[i].chip_id = ids.chip_id; | 456 | g_nvdebug_state[i].chip_id = ids.chip_id; |
176 | printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", | 457 | printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", |
177 | ids.chip_id, ARCH2NAME(ids.architecture)); | 458 | ids.chip_id, ARCH2NAME(ids.architecture)); |
178 | #if INTERRUPT_DEBUG | 459 | #ifdef INTERRUPT_DEBUG |
179 | if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) { | 460 | // For this to work, you must also add IRQF_SHARED to the flags |
180 | printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n"); | 461 | // argument of the request_threaded_irq() call in the nvidia driver |
462 | // (file /usr/src/nvidia.../nvidia/nv.c and nv-msi.c with dkms) | ||
463 | // Then run: | ||
464 | // sudo dkms remove nvidia-srv/VER -k $(uname -r) | ||
465 | // sudo dkms install nvidia-srv/VER -k $(uname -r) --force | ||
466 | // where VER is the version of the nvidia module (e.g. 535.216.03) | ||
467 | int err; | ||
468 | if ((err = request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", &g_nvdebug_state[i]))) { | ||
469 | printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap, error %d\n", err); | ||
181 | } | 470 | } |
182 | #endif // INTERRUPT_DEBUG | 471 | #endif // INTERRUPT_DEBUG |
183 | i++; | 472 | i++; |
@@ -335,6 +624,40 @@ int __init nvdebug_init(void) { | |||
335 | "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops), | 624 | "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops), |
336 | (void*)last_runlist)) | 625 | (void*)last_runlist)) |
337 | goto out_nomem; | 626 | goto out_nomem; |
627 | // Create file `/proc/gpu#/runlist#/wfi_preempt_channel`, world writable | ||
628 | // On Turing and older, `/proc/gpu#/wfi_preempt_channel` | ||
629 | if (!proc_create_data( | ||
630 | "wfi_preempt_channel", 0222, chram_scope, compat_ops(&wfi_preempt_channel_file_ops), | ||
631 | (void*)last_runlist)) | ||
632 | goto out_nomem; | ||
633 | // Create file `/proc/gpu#/runlist#/cta_preempt_channel`, world writable | ||
634 | // On Turing and older, `/proc/gpu#/cta_preempt_channel` | ||
635 | if (!proc_create_data( | ||
636 | "cta_preempt_channel", 0222, chram_scope, compat_ops(&cta_preempt_channel_file_ops), | ||
637 | (void*)last_runlist)) | ||
638 | goto out_nomem; | ||
639 | // Compute-instruction-level (CIL) preemption is only available on Pascal+ | ||
640 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { | ||
641 | // Create file `/proc/gpu#/runlist#/cil_preempt_channel`, world writable | ||
642 | // On Turing and older, `/proc/gpu#/cil_preempt_channel` | ||
643 | if (!proc_create_data( | ||
644 | "cil_preempt_channel", 0222, chram_scope, compat_ops(&cil_preempt_channel_file_ops), | ||
645 | (void*)last_runlist)) | ||
646 | goto out_nomem; | ||
647 | } | ||
648 | // Create files which enable on-GPU scheduling (Pascal+) | ||
649 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { | ||
650 | // Create file `/proc/gpu#/map_mem_chid`, root writable | ||
651 | if (!proc_create_data( | ||
652 | "map_mem_chid", 0200, chram_scope, compat_ops(&map_mem_chid_file_ops), | ||
653 | (void*)last_runlist)) | ||
654 | goto out_nomem; | ||
655 | // Create file `/proc/gpu#/map_mem_ctxid`, root writable | ||
656 | if (!proc_create_data( | ||
657 | "map_mem_ctxid", 0222, rl_dir, compat_ops(&map_mem_ctxid_file_ops), | ||
658 | (void*)last_runlist)) | ||
659 | goto out_nomem; | ||
660 | } | ||
338 | } | 661 | } |
339 | // Create file `/proc/gpu#/runlist#/runlist`, world readable | 662 | // Create file `/proc/gpu#/runlist#/runlist`, world readable |
340 | if (!proc_create_data( | 663 | if (!proc_create_data( |
@@ -346,16 +669,26 @@ int __init nvdebug_init(void) { | |||
346 | "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops), | 669 | "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops), |
347 | (void*)last_runlist)) | 670 | (void*)last_runlist)) |
348 | goto out_nomem; | 671 | goto out_nomem; |
672 | /* On the TU104, the context scheduler (contained in the Host, aka | ||
673 | * PFIFO, unit) has been observed to sometimes to fail to schedule TSGs | ||
674 | * containing re-enabled channels. Resubmitting the runlist | ||
675 | * configuration appears to remediate this condition, and so this API | ||
676 | * is exposed to help reset GPU scheduling as necessary. | ||
677 | */ | ||
678 | // Create file `/proc/gpu#/resubmit_runlist`, world writable | ||
679 | if (!proc_create_data( | ||
680 | "resubmit_runlist", 0222, rl_dir, compat_ops(&resubmit_runlist_file_ops), | ||
681 | (void*)device_id)) | ||
682 | goto out_nomem; | ||
349 | } while (last_runlist-- > 0); | 683 | } while (last_runlist-- > 0); |
350 | /* On the TU104, the context scheduler (contained in the Host, aka | 684 | // Create file `/proc/gpu#/preempt_runlist`, world writable |
351 | * PFIFO, unit) has been observed to sometimes to fail to schedule TSGs | ||
352 | * containing re-enabled channels. Resubmitting the runlist | ||
353 | * configuration appears to remediate this condition, and so this API | ||
354 | * is exposed to help reset GPU scheduling as necessary. | ||
355 | */ | ||
356 | // Create file `/proc/gpu#/resubmit_runlist`, world writable | ||
357 | if (!proc_create_data( | 685 | if (!proc_create_data( |
358 | "resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops), | 686 | "preempt_runlist", 0222, dir, compat_ops(&preempt_runlist_file_ops), |
687 | (void*)device_id)) | ||
688 | goto out_nomem; | ||
689 | // Create file `/proc/gpu#/ack_bad_tsg`, world writable | ||
690 | if (!proc_create_data( | ||
691 | "ack_bad_tsg", 0222, dir, compat_ops(&ack_bad_tsg_file_ops), | ||
359 | (void*)device_id)) | 692 | (void*)device_id)) |
360 | goto out_nomem; | 693 | goto out_nomem; |
361 | // Create file `/proc/gpu#/device_info`, world readable | 694 | // Create file `/proc/gpu#/device_info`, world readable |
@@ -394,6 +727,68 @@ int __init nvdebug_init(void) { | |||
394 | (void*)NV_FUSE_GPC_GM107)) | 727 | (void*)NV_FUSE_GPC_GM107)) |
395 | goto out_nomem; | 728 | goto out_nomem; |
396 | } | 729 | } |
730 | // Create file `/proc/gpu#/CWD_SM_ID#`, world readable (Maxwell+) | ||
731 | // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable (Maxwell+) | ||
732 | // - 6 entries on Maxwell (nvgpu) | ||
733 | // - 16 entries on Pascal through Ampere (at least) (nvgpu, open-gpu-doc) | ||
734 | // - 24 entries on Hopper through Ada (at least) (XXXX) | ||
735 | // XXX: Only working while a context is active | ||
736 | // XXX: Needed for libsmctrl2; hacky | ||
737 | // Tested on GP104, TU102, GV100, AD102 | ||
738 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_HOPPER) { | ||
739 | char file_name[21]; | ||
740 | long i; | ||
741 | for (i = 0; i < 24; i++) { | ||
742 | snprintf(file_name, 20, "CWD_SM_ID%ld", i); | ||
743 | if (!proc_create_data( | ||
744 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
745 | (void*)(0x00405100+4*i))) // XXX: From XXXX | ||
746 | goto out_nomem; | ||
747 | // 18 entries on Ada (RTX 6000 Ada) | ||
748 | // Returns 0xbadf1201 if GPU not active | ||
749 | snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i); | ||
750 | if (!proc_create_data( | ||
751 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
752 | // Nothing between this location and CWD_SM_ID | ||
753 | (void*)((0x00405000)+4*i))) // Found via reverse search from CWD_SM_ID location on Ada | ||
754 | goto out_nomem; | ||
755 | // Nothing in the following 28 words (before 0x00405220) | ||
756 | } | ||
757 | } else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) { | ||
758 | char file_name[21]; | ||
759 | long i; | ||
760 | union reg_range num_gpc_range; | ||
761 | for (i = 0; i < 16; i++) { | ||
762 | snprintf(file_name, 20, "CWD_SM_ID%ld", i); | ||
763 | if (!proc_create_data( | ||
764 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
765 | (void*)(0x00405ba0+4*i))) // NV_PGRAPH_PRI_CWD_SM_ID(i) | ||
766 | goto out_nomem; | ||
767 | // ? entries on Maxwell | ||
768 | // 8 entries on Pascal (test) | ||
769 | // 16 entries on Volta through Ampere (open-gpu-doc) | ||
770 | // Returns 0 if GPU not active | ||
771 | snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i); | ||
772 | if (!proc_create_data( | ||
773 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
774 | (void*)(0x00405b60+4*i))) // NV_PGRAPH_PRI_CWD_GPC_TPC_ID(i) | ||
775 | goto out_nomem; | ||
776 | } | ||
777 | num_gpc_range.offset = 0x00405b00; // NV_PGRAPH_PRI_CWD_FS | ||
778 | // Lower eight bits of register are _NUM_GPCS | ||
779 | num_gpc_range.start_bit = 0; | ||
780 | num_gpc_range.stop_bit = 8; | ||
781 | if (!proc_create_data( | ||
782 | "CWD_FS_NUM_GPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), | ||
783 | (void*)(num_gpc_range.raw))) | ||
784 | goto out_nomem; | ||
785 | num_gpc_range.start_bit = 8; | ||
786 | num_gpc_range.stop_bit = 16; | ||
787 | if (!proc_create_data( | ||
788 | "CWD_FS_NUM_TPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), | ||
789 | (void*)(num_gpc_range.raw))) | ||
790 | goto out_nomem; | ||
791 | } | ||
397 | // Create file `/proc/gpu#/local_memory`, world readable (Pascal+) | 792 | // Create file `/proc/gpu#/local_memory`, world readable (Pascal+) |
398 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { | 793 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { |
399 | if (!proc_create_data( | 794 | if (!proc_create_data( |
@@ -414,6 +809,50 @@ int __init nvdebug_init(void) { | |||
414 | (void*)NV_CE_PCE_MAP)) | 809 | (void*)NV_CE_PCE_MAP)) |
415 | goto out_nomem; | 810 | goto out_nomem; |
416 | } | 811 | } |
812 | // Create files exposing subcontext partitioning (Volta+) | ||
813 | // TODO: Make this not a hack with undocumented magic numbers | ||
814 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) { | ||
815 | char file_name[21]; | ||
816 | long i; | ||
817 | // Create file `/proc/gpu#/partition_ctl`, world readable | ||
818 | if (!proc_create_data( | ||
819 | "partition_ctl", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
820 | (void*)0x00405b2c)) | ||
821 | goto out_nomem; | ||
822 | // Create file `/proc/gpu#/partition_data`, world readable | ||
823 | if (!proc_create_data( | ||
824 | "partition_data", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
825 | (void*)0x00405b30)) | ||
826 | goto out_nomem; | ||
827 | // Create file `/proc/gpu#/partition_data#`, world readable | ||
828 | for (i = 0; i < 64; i++) { | ||
829 | snprintf(file_name, 20, "partition_data%ld", i); | ||
830 | if (!proc_create_data( | ||
831 | file_name, 0444, dir, compat_ops(&nvdebug_read_part_file_ops), | ||
832 | (void*)i)) | ||
833 | goto out_nomem; | ||
834 | } | ||
835 | // For debugging what MPS is changing | ||
836 | // Create file `/proc/gpu#/CWD_CG0`, world readable | ||
837 | if (!proc_create_data( | ||
838 | "CWD_CG0", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
839 | (void*)0x00405bf0)) | ||
840 | goto out_nomem; | ||
841 | // Create file `/proc/gpu#/CWD_CG1`, world readable | ||
842 | if (!proc_create_data( | ||
843 | "CWD_CG1", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
844 | (void*)0x00405bf4)) | ||
845 | goto out_nomem; | ||
846 | // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable | ||
847 | // This does not appear to work on Hopper. Works on Ampere. | ||
848 | /*for (i = 0; i < 16; i++) { | ||
849 | snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i); | ||
850 | if (!proc_create_data( | ||
851 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
852 | (void*)(0x00405b60+4*i))) | ||
853 | goto out_nomem; | ||
854 | }*/ | ||
855 | } | ||
417 | } | 856 | } |
418 | // (See Makefile if you want to know the origin of GIT_HASH.) | 857 | // (See Makefile if you want to know the origin of GIT_HASH.) |
419 | printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n"); | 858 | printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n"); |
@@ -439,16 +878,19 @@ static void __exit nvdebug_exit(void) { | |||
439 | char device_id[7]; | 878 | char device_id[7]; |
440 | snprintf(device_id, 7, "gpu%d", g_nvdebug_devices); | 879 | snprintf(device_id, 7, "gpu%d", g_nvdebug_devices); |
441 | remove_proc_subtree(device_id, NULL); | 880 | remove_proc_subtree(device_id, NULL); |
881 | // Force-free associated allocations | ||
442 | g = &g_nvdebug_state[g_nvdebug_devices]; | 882 | g = &g_nvdebug_state[g_nvdebug_devices]; |
883 | gc_page_directory(g, true); | ||
443 | // Free BAR mappings for PCIe devices | 884 | // Free BAR mappings for PCIe devices |
444 | if (g && g->pcid) { | 885 | if (g && g->pcid) { |
886 | #ifdef INTERRUPT_DEBUG | ||
887 | // IRQ handler uses g->regs, so free IRQ first | ||
888 | free_irq(g->pcid->irq, g); | ||
889 | #endif // INTERRUPT_DEBUG | ||
445 | if (g->regs) | 890 | if (g->regs) |
446 | pci_iounmap(g->pcid, g->regs); | 891 | pci_iounmap(g->pcid, g->regs); |
447 | if (g->bar2) | 892 | if (g->bar2) |
448 | pci_iounmap(g->pcid, g->bar2); | 893 | pci_iounmap(g->pcid, g->bar2); |
449 | #if INTERRUPT_DEBUG | ||
450 | free_irq(g->pcid->irq, g->pcid); | ||
451 | #endif // INTERRUPT_DEBUG | ||
452 | } else { | 894 | } else { |
453 | if (g->regs) | 895 | if (g->regs) |
454 | iounmap(g->regs); | 896 | iounmap(g->regs); |
diff --git a/nvdebug_linux.h b/nvdebug_linux.h index 2ad4ce1..b232720 100644 --- a/nvdebug_linux.h +++ b/nvdebug_linux.h | |||
@@ -20,6 +20,11 @@ static inline struct gk20a *get_gk20a(struct device *dev) { | |||
20 | #define pde_data PDE_DATA | 20 | #define pde_data PDE_DATA |
21 | #endif | 21 | #endif |
22 | 22 | ||
23 | // iommu_map() requires an extra parameter on Linux 6.3+ | ||
24 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(6,3,0) | ||
25 | #define iommu_map(a, b, c, d, e) iommu_map(a, b, c, d, e, GFP_KERNEL) | ||
26 | #endif | ||
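An illustrative call site for the wrapper above (a sketch; the function and its arguments are placeholders): existing five-argument callers keep compiling, and on Linux 6.3+ the macro silently appends GFP_KERNEL as the new sixth argument.

    // Sketch only; assumes <linux/iommu.h>. On 6.3+ this expands to
    // iommu_map(dom, iova, phys, PAGE_SIZE, IOMMU_READ | IOMMU_WRITE, GFP_KERNEL).
    static int demo_map_one_page(struct iommu_domain *dom, unsigned long iova,
                                 phys_addr_t phys)
    {
        return iommu_map(dom, iova, phys, PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
    }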
27 | |||
23 | // We us the data field of the proc_dir_entry ("PDE" in this function) to store | 28 | // We us the data field of the proc_dir_entry ("PDE" in this function) to store |
24 | // our index into the g_nvdebug_state array | 29 | // our index into the g_nvdebug_state array |
25 | static inline int seq2gpuidx(struct seq_file *s) { | 30 | static inline int seq2gpuidx(struct seq_file *s) { |
@@ -1,19 +1,13 @@ | |||
1 | /* Copyright 2024 Joshua Bakita | 1 | /* Copyright 2024 Joshua Bakita |
2 | * Helpers for dealing with the runlist and other Host (PFIFO) registers | 2 | * Helpers for dealing with the runlist and other Host (PFIFO) registers |
3 | */ | 3 | */ |
4 | #include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() | ||
4 | #include <linux/printk.h> // For printk() | 5 | #include <linux/printk.h> // For printk() |
5 | #include <asm/errno.h> // For error defines | 6 | #include <asm/errno.h> // For error defines |
6 | #include <asm/io.h> // For phys_to_virt() | 7 | #include <asm/io.h> // For phys_to_virt() |
7 | 8 | ||
8 | #include "nvdebug.h" | 9 | #include "nvdebug.h" |
9 | 10 | ||
10 | // Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer | ||
11 | // in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN | ||
12 | // **must** not be moved during runlist traversal. | ||
13 | // The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this must | ||
14 | // be enabled to print the runlist on the TX2. | ||
15 | //#define FALLBACK_TO_PRAMIN | ||
16 | |||
17 | /* Get RunList RAM (RLRAM) offset for a runlist from the device topology | 11 | /* Get RunList RAM (RLRAM) offset for a runlist from the device topology |
18 | @param rl_id Which runlist to obtain [numbered in order of appearance in | 12 | @param rl_id Which runlist to obtain [numbered in order of appearance in |
19 | the device topology (PTOP) registers] | 13 | the device topology (PTOP) registers] |
@@ -116,6 +110,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl | |||
116 | runlist_len = submit.len; | 110 | runlist_len = submit.len; |
117 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", | 111 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", |
118 | rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); | 112 | rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); |
113 | printk(KERN_INFO "[nvdebug] Runlist offset is %d\n", submit.offset); | ||
119 | rl_iter->runlist_pri_base = runlist_pri_base; | 114 | rl_iter->runlist_pri_base = runlist_pri_base; |
120 | } | 115 | } |
121 | // Return early on an empty runlist | 116 | // Return early on an empty runlist |
@@ -130,6 +125,12 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl | |||
130 | if ((err = get_bar2_pdb(g, &pd_config)) < 0) | 125 | if ((err = get_bar2_pdb(g, &pd_config)) < 0) |
131 | goto attempt_pramin_access; | 126 | goto attempt_pramin_access; |
132 | 127 | ||
128 | // XXX: PD version detection not working on Hopper [is_ver2 errantly (?) unset] | ||
129 | if (g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) { | ||
130 | printk(KERN_WARNING "[nvdebug] V3 page tables do not currently work on Hopper! Mystery config: %llx\n", pd_config.raw); | ||
131 | err = -EOPNOTSUPP; | ||
132 | goto attempt_pramin_access; | ||
133 | } | ||
133 | if (pd_config.is_ver2) | 134 | if (pd_config.is_ver2) |
134 | runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM); | 135 | runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM); |
135 | else | 136 | else |
@@ -233,7 +234,7 @@ int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id) { | |||
233 | } | 234 | } |
234 | 235 | ||
235 | // Read and write runlist configuration, triggering a resubmit | 236 | // Read and write runlist configuration, triggering a resubmit |
236 | int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { | 237 | int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off) { |
237 | // Necessary registers do not exist pre-Fermi | 238 | // Necessary registers do not exist pre-Fermi |
238 | if (g->chip_id < NV_CHIP_ID_FERMI) | 239 | if (g->chip_id < NV_CHIP_ID_FERMI) |
239 | return -EOPNOTSUPP; | 240 | return -EOPNOTSUPP; |
@@ -252,6 +253,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { | |||
252 | return -EINVAL; | 253 | return -EINVAL; |
253 | if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1) | 254 | if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1) |
254 | return -EIO; | 255 | return -EIO; |
256 | preempt_runlist(g, rl_id); | ||
257 | if (off != -1) | ||
258 | submit.offset = off; | ||
255 | nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw); | 259 | nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw); |
256 | } else { | 260 | } else { |
257 | int err; | 261 | int err; |
@@ -261,6 +265,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { | |||
261 | return err; | 265 | return err; |
262 | if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) | 266 | if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) |
263 | return -EIO; | 267 | return -EIO; |
268 | preempt_runlist(g, rl_id); | ||
269 | if (off != -1) | ||
270 | submit.offset = off; | ||
264 | // On Ampere, this does not appear to trigger a preempt of the | 271 | // On Ampere, this does not appear to trigger a preempt of the |
265 | // currently-running channel (even if the currently running channel | 272 | // currently-running channel (even if the currently running channel |
266 | // becomes disabled), but will cause newly re-enabled channels | 273 | // becomes disabled), but will cause newly re-enabled channels |
@@ -270,3 +277,255 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { | |||
270 | } | 277 | } |
271 | return 0; | 278 | return 0; |
272 | } | 279 | } |
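A hedged note on the new third argument to resubmit_runlist() above: since `off` is unsigned, passing -1 matches the all-ones sentinel (`off != -1`) in the hunks above and leaves the previously submitted offset untouched, while any other value is written into submit.offset before resubmission. The calls here are placeholders, not code from this commit.

    // Sketch: resubmit runlist 0, keeping whatever offset was last submitted...
    resubmit_runlist(g, 0, -1);
    // ...or resubmit it with the submit offset field forced to 4.
    resubmit_runlist(g, 0, 4);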
280 | |||
281 | /* Get a CPU-accessible pointer to an arbitrary-address-space instance block | ||
282 | @param instance_addr Address of instance block | ||
283 | @param instance_target Aperture/target of instance block address | ||
284 | @return A dereferenceable KVA, NULL if not found, or an ERR_PTR-wrapped error | ||
285 | |||
286 | Note: The returned address will be a BAR2 or physical address, mapped into | ||
287 | kernel space, /not/ a PRAMIN-derived address. Thus, the returned | ||
288 | address will have an indefinite lifetime, and will be unaffected by use | ||
289 | of PRAMIN elsewhere (such as to read the CTXSW block). | ||
290 | */ | ||
291 | instance_ctrl_t *instance_deref(struct nvdebug_state *g, uint64_t instance_addr, | ||
292 | enum INST_TARGET instance_target) { | ||
293 | if (!instance_addr || instance_target == TARGET_INVALID) | ||
294 | return ERR_PTR(-EINVAL); | ||
295 | if (instance_target == TARGET_VID_MEM) { | ||
296 | int err; | ||
297 | uint64_t inst_bar_vaddr; | ||
298 | page_dir_config_t pd_config; | ||
299 | // Only access VID_MEM via BAR2; do not fall back to PRAMIN | ||
300 | if (!g->bar2) | ||
301 | return NULL; | ||
302 | // Find page tables which define how BAR2/3 offsets are translated to | ||
303 | // physical VID/SYS_MEM addresses. | ||
304 | if ((err = get_bar2_pdb(g, &pd_config)) < 0) { | ||
305 | printk(KERN_ERR "[nvdebug] Error: Unable to access page directory " | ||
306 | "configuration for BAR2/3. Error %d.\n", err); | ||
307 | return ERR_PTR(err); | ||
308 | } | ||
309 | // Search the BAR2/3 page tables for the offset at which the instance | ||
310 | // block is mapped (reverse translation). | ||
311 | if (pd_config.is_ver2) | ||
312 | inst_bar_vaddr = search_page_directory(g, pd_config, instance_addr, instance_target); | ||
313 | else | ||
314 | inst_bar_vaddr = search_v1_page_directory(g, pd_config, instance_addr, instance_target); | ||
315 | if (!inst_bar_vaddr) { | ||
316 | printk(KERN_WARNING "[nvdebug] Warning: Instance block %#018llx " | ||
317 | "(%s) appears unmapped in BAR2/3.\n", instance_addr, | ||
318 | target_to_text(instance_target)); | ||
319 | return NULL; | ||
320 | } | ||
321 | return g->bar2 + inst_bar_vaddr; | ||
322 | } else { | ||
323 | struct iommu_domain *dom; | ||
324 | // SYS_MEM addresses are physical addresses *from the perspective of | ||
325 | // the device* ("bus addresses"), and may not necessarily correspond | ||
326 | // to physical addresses from the perspective of the CPU. The I/O MMU | ||
327 | // is responsible for mapping bus addresses to CPU-relative physical | ||
328 | // addresses when there is no direct correspondence. If an I/O MMU is | ||
329 | // enabled on this GPU, ask it to translate the bus address to a | ||
330 | // CPU-relative physical address. | ||
331 | if ((dom = iommu_get_domain_for_dev(g->dev))) { | ||
332 | // XXX: As of Aug 2024, this is not tested, so include extra logging | ||
333 | printk(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#llx for instance block", instance_addr); | ||
334 | if (!(instance_addr = iommu_iova_to_phys(dom, instance_addr))) { | ||
335 | printk(KERN_ERR "[nvdebug] Error: I/O MMU failed to translate " | ||
336 | "%#018llx (%s) to a CPU-relative physical address.\n", | ||
337 | instance_addr, target_to_text(instance_target)); | ||
338 | return ERR_PTR(-EADDRNOTAVAIL); | ||
339 | } | ||
340 | printk(KERN_DEBUG " to physical address %#llx.\n", instance_addr); | ||
341 | } | ||
342 | // Convert from a physical address to a kernel virtual address (KVA) | ||
343 | return phys_to_virt(instance_addr); | ||
344 | } | ||
345 | } | ||
346 | |||
347 | /* Get a CPU-accessible pointer to the CTXSW block for a channel instance block | ||
348 | @param inst Dereferenceable pointer to the start of a complete instance block | ||
349 | @return A dereferenceable KVA, NULL if not found, or an ERR_PTR-wrapped error | ||
350 | |||
351 | Note: The returned address **will** be a PRAMIN-based address. Any changes to | ||
352 | PRAMIN **will** invalidate the returned pointer. `inst` **cannot** be a | ||
353 | pointer into the PRAMIN space. | ||
354 | */ | ||
355 | context_switch_ctrl_t *get_ctxsw(struct nvdebug_state *g, | ||
356 | instance_ctrl_t *inst) { | ||
357 | int err; | ||
358 | context_switch_ctrl_t *wfi = NULL; | ||
359 | uint64_t wfi_virt, wfi_phys, ctxsw_virt, ctxsw_phys; | ||
360 | enum INST_TARGET wfi_phys_aperture, ctxsw_phys_aperture; | ||
361 | |||
362 | // The WFI block contains a pointer to the CTXSW block, which contains the | ||
363 | // preemption mode configuration for the context. (As best I can tell, the WFI | ||
364 | // block is subcontext-specific, whereas the CTXSW block is context-wide.) | ||
365 | wfi_virt = (uint64_t)inst->engine_wfi_ptr << 12; | ||
366 | |||
367 | // WFI may not be configured | ||
368 | if (!wfi_virt) | ||
369 | goto out; | ||
370 | |||
371 | // Determine the physical location of the WFI block | ||
372 | if (inst->engine_wfi_is_virtual) { | ||
373 | if (inst->pdb.is_ver2) | ||
374 | err = translate_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture); | ||
375 | else | ||
376 | err = translate_v1_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture); | ||
377 | if (err) { | ||
378 | printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; WFI block " | ||
379 | "pointer %#018llx (virt) cannot be found in process page tables! " | ||
380 | "Translation error %d.\n", wfi_virt, -err); | ||
381 | return ERR_PTR(-ENOTRECOVERABLE); | ||
382 | } | ||
383 | } else { | ||
384 | wfi_phys = (uint64_t)inst->engine_wfi_ptr << 12; | ||
385 | wfi_phys_aperture = inst->engine_wfi_target; | ||
386 | } | ||
387 | |||
388 | // Get a dereferenceable pointer to the WFI block (the WFI and CTXSW blocks | ||
389 | // have not been observed as mapped in BAR2/3, so we use the PRAMIN window). | ||
390 | // Note: On Jetson boards, we could attempt to avoid PRAMIN since CTXSW is in | ||
391 | // SYS_MEM, but this function will always need to use PRAMIN to work | ||
392 | // around the WFI and CTXSW blocks not being accessible via BAR2/3 on | ||
393 | // PCIe GPUs, so always use PRAMIN for simplicity. | ||
394 | if ((wfi_phys = addr_to_pramin_mut(g, wfi_phys, wfi_phys_aperture)) == -1) | ||
395 | goto out; | ||
396 | wfi = g->regs + wfi_phys + NV_PRAMIN; | ||
397 | |||
398 | // XXX | ||
399 | // return wfi; | ||
400 | // End XXX | ||
401 | |||
402 | // While the WFI block uses the same layout as the context switch (CTXSW) | ||
403 | // control block, it is mostly unpopulated except for a few pointers on GPUs | ||
404 | // after Volta. This appears to be related to subcontexts, where each | ||
405 | // subcontext has its own WFI block containing a pointer to the overarching | ||
406 | // CTXSW block. Only attempt to find the overarching CTXSW block if at least | ||
407 | // one subcontext is enabled. | ||
408 | if (inst->subcontext_pdb_valid) { | ||
409 | // Subcontexts are Volta+-only. Volta only supports Page Table Ver. 2 | ||
410 | if (!inst->pdb.is_ver2) | ||
411 | return ERR_PTR(-ENOTRECOVERABLE); | ||
412 | // Obtain the address of the CTXSW block in this context | ||
413 | ctxsw_virt = wfi->context_buffer_ptr_hi; | ||
414 | ctxsw_virt <<= 32; | ||
415 | ctxsw_virt |= wfi->context_buffer_ptr_lo; | ||
416 | if (!ctxsw_virt) { | ||
417 | printk(KERN_WARNING "[nvdebug] Warning: WFI block at %#018llx (phys) " | ||
418 | "contains an empty context block pointer.\n", wfi_phys); | ||
419 | goto out; | ||
420 | } | ||
421 | |||
422 | // All the pointers in the WFI block are virtual, so convert the CTXSW | ||
423 | // block pointer to a physical address. We should always be able to find a | ||
424 | // mapping for ctxsw_virt. | ||
425 | if ((err = translate_page_directory(g, inst->pdb, ctxsw_virt, &ctxsw_phys, &ctxsw_phys_aperture))) { | ||
426 | printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; context " | ||
427 | "block pointer %#018llx (virt) cannot be found in process page " | ||
428 | "tables! Translation error %d.\n", ctxsw_virt, -err); | ||
429 | return ERR_PTR(-ENOTRECOVERABLE); | ||
430 | } | ||
431 | |||
432 | // Get a dereferenceable pointer to the CTXSW block (via PRAMIN; invalidates `wfi`) | ||
433 | if ((ctxsw_phys = addr_to_pramin_mut(g, ctxsw_phys, ctxsw_phys_aperture)) == -1) | ||
434 | goto out; | ||
435 | return g->regs + ctxsw_phys + NV_PRAMIN; | ||
436 | } else { | ||
437 | // Without subcontexts, the WFI block is the CTXSW block (ex: Pascal) | ||
438 | return wfi; | ||
439 | } | ||
440 | out: | ||
441 | return NULL; | ||
442 | } | ||
443 | |||
444 | /* Change the preemption type to be used on a context's budget expiration | ||
445 | @param chan_id As context IDs are hard to obtain and use, this function takes | ||
446 | a channel ID and looks up and modifies the associated context. | ||
447 | @param rl_id Which channel RAM address space is this channel ID in? (Not | ||
448 | used on pre-Ampere GPUs.) | ||
449 | @param mode Preemption mode to set. | ||
450 | @return 0 or -errno on error | ||
451 | |||
452 | Note: This change will not apply if the channel's context has running work, | ||
453 | or if the GPU is idle and this channel's context was last to run. | ||
454 | Please ensure some other task is running before calling this API. | ||
455 | */ | ||
456 | int set_channel_preemption_mode(struct nvdebug_state *g, uint32_t chan_id, | ||
457 | uint32_t rl_id, | ||
458 | enum COMPUTE_PREEMPT_TYPE mode) { | ||
459 | uint64_t instance_ptr = 0; | ||
460 | enum INST_TARGET instance_target; | ||
461 | instance_ctrl_t *inst = NULL; | ||
462 | context_switch_ctrl_t *ctxsw = NULL; | ||
463 | struct runlist_iter rl_iter; | ||
464 | uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT; | ||
465 | // Obtain the instance block | ||
466 | if (g->chip_id < NV_CHIP_ID_AMPERE) { | ||
467 | // Pre-Ampere, Channel RAM includes instance block pointers | ||
468 | channel_ctrl_t chan; | ||
469 | if (chan_id > MAX_CHID) | ||
470 | return -ERANGE; | ||
471 | if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan_id))) == -1) | ||
472 | return -EIO; | ||
473 | instance_ptr = (uint64_t)chan.inst_ptr << 12; | ||
474 | instance_target = chan.inst_target; | ||
475 | } else { | ||
476 | // Starting with Ampere, instance block pointers are only included in | ||
477 | // runlist entries. Something like this could work on Maxwell+, but | ||
478 | // access via Channel RAM is more heavily-tested. | ||
479 | struct gv100_runlist_chan* chan; | ||
480 | int err; | ||
481 | loff_t pos = 0; | ||
482 | // Based off logic of switch_to_tsg_file_write() in runlist_procfs.c | ||
483 | if ((err = get_runlist_iter(g, rl_id, &rl_iter))) | ||
484 | return err; | ||
485 | while (pos < rl_iter.len && !instance_ptr) { | ||
486 | for_chan_in_tsg(g, chan, rl_iter.curr_entry) { | ||
487 | if (chan_id == chid(g, chan)) { | ||
488 | // Channel entry found in runlist. Extract instance ptr. | ||
489 | instance_ptr = (uint64_t)chan->inst_ptr_hi << 32; | ||
490 | instance_ptr |= (uint64_t)inst_ptr_lo(g, chan) << 12; | ||
491 | instance_target = inst_target(g, chan); | ||
492 | break; | ||
493 | } | ||
494 | } | ||
495 | pos += 1 + tsg_length(g, rl_iter.curr_entry); | ||
496 | rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry); | ||
497 | } | ||
498 | // Context switch timeout configuration register was moved with Ampere+ | ||
499 | ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0); | ||
500 | } | ||
501 | if (!instance_ptr) | ||
502 | return -ENOENT; | ||
503 | // Obtain an instance block pointer routed via BAR2 or SYS_MEM | ||
504 | inst = instance_deref(g, instance_ptr, instance_target); | ||
505 | if (IS_ERR_OR_NULL(inst)) | ||
506 | return PTR_ERR(inst); | ||
507 | // Obtain pointer to CTXSW block routed via PRAMIN (the CTXSW block | ||
508 | // does not appear to be mapped into BAR2). | ||
509 | ctxsw = get_ctxsw(g, inst); | ||
510 | if (IS_ERR_OR_NULL(ctxsw)) | ||
511 | return PTR_ERR(ctxsw); | ||
512 | ctxsw->compute_preemption_options = mode; | ||
513 | // If switching to a preemption mode that runs blocks or kernels | ||
514 | // non-preemptively (CTA-level and WFI respectively), disable the context switch | ||
515 | // timeout. If switching to compute-instruction-level preemption (CILP), | ||
516 | // reenable it. Observed to be necessary on (at least) gv11b, tu102, and ga10b | ||
517 | // XXX: On ga10b (at least), the timeout configuration is reset on a resume | ||
518 | // from suspend, overwriting the change made here. This causes a CTXSW | ||
519 | // TIMEOUT interrupt to be triggered if any application tries to run | ||
520 | // non-preemptively for longer than the timeout period (3100ms on gv11b | ||
521 | // and ga10b). | ||
522 | if (g->chip_id >= NV_CHIP_ID_VOLTA) { | ||
523 | ctxsw_timeout_t timeout_config; | ||
524 | if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1) | ||
525 | return -EIO; | ||
526 | printk(KERN_DEBUG "[nvdebug] Previous Ctx. Sw. Timeout Configuration: period %d %s\n", timeout_config.period, timeout_config.enabled ? "enabled" : "disabled"); | ||
527 | timeout_config.enabled = mode == PREEMPT_CILP; | ||
528 | nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw); | ||
529 | } | ||
530 | return 0; | ||
531 | } | ||
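A minimal kernel-side sketch of using the API above, assuming it is called from elsewhere in the module (the helper name and the channel/runlist IDs are hypothetical; only `set_channel_preemption_mode()` and `PREEMPT_CTA` come from the code above):

```c
// Hypothetical helper (illustrative only): request CTA-level (block-level)
// preemption for channel 4 on runlist 0. Per the note above, some other task
// should be running for the change to take effect.
static int nvdebug_demo_force_cta(struct nvdebug_state *g)
{
	int err = set_channel_preemption_mode(g, /* chan_id */ 4, /* rl_id */ 0,
	                                      PREEMPT_CTA);
	if (err)
		printk(KERN_WARNING "[nvdebug] CTA preemption request failed: %d\n", err);
	return err;
}
```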
diff --git a/runlist_procfs.c b/runlist_procfs.c
index b2159f6..a3a6df3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -1,12 +1,117 @@ | |||
1 | #include <linux/seq_file.h> // For seq_* functions and types | 1 | #include <linux/seq_file.h> // For seq_* functions and types |
2 | #include <linux/version.h> // Macros to detect kernel version | 2 | #include <linux/version.h> // Macros to detect kernel version |
3 | #include <linux/platform_device.h> // For platform_get_resource() | ||
4 | #include <linux/pci.h> // For pci_resource_start() | ||
5 | #include <linux/iommu.h> // For iommu_ functions | ||
6 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,10,0) | ||
7 | #include <linux/dma-map-ops.h> // For get_dma_ops() | ||
8 | #endif | ||
3 | 9 | ||
4 | #include "nvdebug_linux.h" | 10 | #include "nvdebug_linux.h" |
5 | 11 | ||
6 | // Uncomment to expand channel status information when printing the runlist | 12 | // We cannot touch PRAMIN (via page table operations or ctxsw access) if we're |
13 | // using it to walk the runlist | ||
14 | //#ifndef FALLBACK_TO_PRAMIN | ||
15 | // Uncomment to expand channel status, instance, and context information when | ||
16 | // printing the runlist | ||
7 | #define DETAILED_CHANNEL_INFO | 17 | #define DETAILED_CHANNEL_INFO |
18 | //#endif | ||
8 | 19 | ||
9 | #ifdef DETAILED_CHANNEL_INFO | 20 | #ifdef DETAILED_CHANNEL_INFO |
21 | // Print the channel instance and context switch blocks | ||
22 | // XXX: THIS IS UNSAFE ON KEPLER! | ||
23 | // instance_deref() will call into the page table logic, which may move PRAMIN. | ||
24 | // PRAMIN appears heavily utilized by the driver on Bonham (at least), and | ||
25 | // moving it causes problems. | ||
26 | static int runlist_detail_seq_show_inst(struct seq_file *s, struct nvdebug_state *g, char *prefix, uint64_t instance_ptr, enum INST_TARGET instance_target) { | ||
27 | instance_ctrl_t *inst = NULL; | ||
28 | context_switch_ctrl_t *ctxsw = NULL; | ||
29 | int i; | ||
30 | |||
31 | #ifdef FALLBACK_TO_PRAMIN | ||
32 | bar0_window_t win; | ||
33 | win.raw = nvdebug_readl(g, NV_XAL_EP_BAR0_WINDOW_BASE); | ||
34 | inst = g->regs + NV_PRAMIN + addr_to_pramin_mut(g, instance_ptr, instance_target); | ||
35 | #else | ||
36 | if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target))) | ||
37 | return PTR_ERR(inst); | ||
38 | #endif // FALLBACK_TO_PRAMIN | ||
39 | // If unable to access instance block, skip | ||
40 | if (!inst) | ||
41 | return 0; | ||
42 | |||
43 | // Print the channel instance block | ||
44 | // As an ID, use upper 52 bits of the instance address (lower 12 are zero) | ||
45 | //seq_printf(s, "%s+- Inst %-13llx-+\n", prefix, instance_ptr >> 12); | ||
46 | seq_printf(s, "%s|= Instance Block ====|\n", prefix); | ||
47 | seq_printf(s, "%s| Target Engine: %2d|\n", prefix, inst->fc_target); | ||
48 | seq_printf(s, "%s| Privileged: %1d|\n", prefix, inst->fc_config_is_priv); | ||
49 | seq_printf(s, "%s| Channel VEID: %2d|\n", prefix, inst->fc_chan_info_veid); | ||
50 | seq_printf(s, "%s| WFI PTR: |\n", prefix); | ||
51 | seq_printf(s, "%s| %#018llx|\n", prefix, (uint64_t)inst->engine_wfi_ptr << 12); | ||
52 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->engine_wfi_target)); | ||
53 | seq_printf(s, "%s| Virtual address? %d|\n", prefix, inst->engine_wfi_is_virtual); | ||
54 | seq_printf(s, "%s| WFI VEID: %2d|\n", prefix, inst->engine_wfi_veid); | ||
55 | seq_printf(s, "%s| All PDB PTR: |\n", prefix); | ||
56 | seq_printf(s, "%s| %#018llx|\n", prefix, (u64)inst->pdb.page_dir << 12); | ||
57 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->pdb.target)); | ||
58 | seq_printf(s, "%s| %20s|\n", prefix, inst->pdb.is_volatile ? "volatile" : "non-volatile"); | ||
59 | // seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->pdb.raw); | ||
60 | seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(inst->subcontext_pdb_valid)); | ||
61 | // Print configuration of every enabled subcontext | ||
62 | for (i = 0; i < 64; i++) { | ||
63 | // Skip subcontexts without their enable bit set | ||
64 | if (!(1 & (inst->subcontext_pdb_valid >> i))) | ||
65 | continue; | ||
66 | seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i, inst->subcontext[i].pasid); | ||
67 | seq_printf(s, "%s| SC%02d PDB PTR: |\n", prefix, i); | ||
68 | seq_printf(s, "%s| %#018llx|\n", prefix, ((u64)inst->subcontext[i].pdb.page_dir_hi << 32) | ((u64)inst->subcontext[i].pdb.page_dir_lo << 12)); | ||
69 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->subcontext[i].pdb.target)); | ||
70 | seq_printf(s, "%s| %20s|\n", prefix, inst->subcontext[i].pdb.is_volatile ? "volatile" : "non-volatile"); | ||
71 | // seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->subcontext[i].pdb.raw); | ||
72 | } | ||
73 | |||
74 | // XXX: CTXSW is only accessible via PRAMIN. Accessing PRAMIN appears either | ||
75 | // to be broken or to race with the driver on Kepler (gk104 tested). So, | ||
76 | // do not attempt to touch the CTXSW block on Kepler. | ||
77 | // TODO: This check should be moved into addr_to_pramin_mut(). | ||
78 | if (g->chip_id < NV_CHIP_ID_MAXWELL) | ||
79 | return 0; | ||
80 | // End XXX | ||
81 | |||
82 | if (IS_ERR(ctxsw = get_ctxsw(g, inst))) { | ||
83 | #ifdef FALLBACK_TO_PRAMIN | ||
84 | nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw); | ||
85 | #endif | ||
86 | return PTR_ERR(ctxsw); | ||
87 | } | ||
88 | // If unable to access CTXSW block, skip | ||
89 | if (!ctxsw) { | ||
90 | #ifdef FALLBACK_TO_PRAMIN | ||
91 | nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw); | ||
92 | #endif | ||
93 | return 0; | ||
94 | } | ||
95 | // Access and print the preemption mode and context ID | ||
96 | seq_printf(s, "%s|= Context State =====|\n", prefix); | ||
97 | seq_printf(s, "%s| Ctx. ID: %#10x|\n", prefix, ctxsw->context_id); | ||
98 | // No other CTXSW fields are supported pre-Pascal | ||
99 | if (g->chip_id < NV_CHIP_ID_PASCAL) | ||
100 | return 0; | ||
101 | seq_printf(s, "%s| Gfx. Preemption:%4s|\n", prefix, | ||
102 | graphics_preempt_type_to_text(ctxsw->graphics_preemption_options)); | ||
103 | seq_printf(s, "%s| Cmp. Preemption:%4s|\n", prefix, | ||
104 | compute_preempt_type_to_text(ctxsw->compute_preemption_options)); | ||
105 | seq_printf(s, "%s| #WFI Saves:%9d|\n", prefix, ctxsw->num_wfi_save_operations); | ||
106 | seq_printf(s, "%s| #CTA Saves:%9d|\n", prefix, ctxsw->num_cta_save_operations); | ||
107 | seq_printf(s, "%s| #GFXP Saves:%8d|\n", prefix, ctxsw->num_gfxp_save_operations); | ||
108 | seq_printf(s, "%s| #CILP Saves:%8d|\n", prefix, ctxsw->num_cilp_save_operations); | ||
109 | #ifdef FALLBACK_TO_PRAMIN | ||
110 | nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw); | ||
111 | #endif | ||
112 | return 0; | ||
113 | } | ||
114 | |||
10 | /* Print channel details using PCCSR (Programmable Channel Control System RAM?) | 115 | /* Print channel details using PCCSR (Programmable Channel Control System RAM?) |
11 | @param s Pointer to state from seq_file subsystem to pass to seq_printf | 116 | @param s Pointer to state from seq_file subsystem to pass to seq_printf |
12 | @param g Pointer to our internal GPU state | 117 | @param g Pointer to our internal GPU state |
@@ -32,16 +137,19 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state | |||
32 | seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); | 137 | seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); |
33 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); | 138 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); |
34 | seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); | 139 | seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); |
35 | return 0; | 140 | // Print instance block |
141 | return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, chan.inst_target); | ||
36 | } | 142 | } |
37 | 143 | ||
38 | /* `runlist_detail_seq_show_chan()`, but for Ampere+ | 144 | /* `runlist_detail_seq_show_chan()`, but for Ampere+ |
145 | @param instance_ptr Address for the channel instance block | ||
146 | @param instance_target Aperture of `instance_ptr` | ||
39 | @param runlist_pri_base Base of the RLRAM region for this runlist | 147 | @param runlist_pri_base Base of the RLRAM region for this runlist |
40 | 148 | ||
41 | `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on | 149 | `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on |
42 | Ampere+, and its location is configured in Runlist RAM. | 150 | Ampere+, and its location is configured in Runlist RAM. |
43 | */ | 151 | */ |
44 | static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { | 152 | static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base, uint64_t instance_ptr, enum INST_TARGET instance_target) { |
45 | runlist_channel_config_t channel_config; | 153 | runlist_channel_config_t channel_config; |
46 | channel_ctrl_ga100_t chan; | 154 | channel_ctrl_ga100_t chan; |
47 | 155 | ||
@@ -63,7 +171,7 @@ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug | |||
63 | seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); | 171 | seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); |
64 | seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); | 172 | seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); |
65 | seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); | 173 | seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); |
66 | return 0; | 174 | return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, instance_target); |
67 | } | 175 | } |
68 | #endif | 176 | #endif |
69 | 177 | ||
@@ -173,7 +281,7 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) { | |||
173 | if (g->chip_id < NV_CHIP_ID_AMPERE) | 281 | if (g->chip_id < NV_CHIP_ID_AMPERE) |
174 | runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); | 282 | runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); |
175 | else | 283 | else |
176 | runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); | 284 | runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base, instance_ptr, inst_target(g, entry)); |
177 | #endif | 285 | #endif |
178 | seq_printf(s, "%s+---------------------+\n", indt); | 286 | seq_printf(s, "%s+---------------------+\n", indt); |
179 | } | 287 | } |
@@ -232,15 +340,17 @@ struct file_operations preempt_tsg_file_ops = { | |||
232 | 340 | ||
233 | ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, | 341 | ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, |
234 | size_t count, loff_t *off) { | 342 | size_t count, loff_t *off) { |
235 | uint32_t target_runlist; | 343 | uint32_t target_runlist, target_offset; |
236 | struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; | 344 | struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; |
237 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | 345 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec |
238 | int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); | 346 | int err = kstrtou32_from_user(buffer, count, 0, &target_offset); |
239 | if (err) | 347 | if (err) |
240 | return err; | 348 | return err; |
349 | // (Ab)use the PDE_DATA field for the runlist ID | ||
350 | target_runlist = file2gpuidx(f); | ||
241 | 351 | ||
242 | // resubmit_runlist() checks that target_runlist is valid | 352 | // resubmit_runlist() checks that target_runlist is valid |
243 | if ((err = resubmit_runlist(g, target_runlist))) | 353 | if ((err = resubmit_runlist(g, target_runlist, target_offset))) |
244 | return err; | 354 | return err; |
245 | 355 | ||
246 | return count; | 356 | return count; |
@@ -351,6 +461,54 @@ struct file_operations enable_channel_file_ops = { | |||
351 | .llseek = default_llseek, | 461 | .llseek = default_llseek, |
352 | }; | 462 | }; |
353 | 463 | ||
464 | ssize_t comm_preempt_channel_file_write(struct file *f, const char __user *buf, | ||
465 | size_t count, loff_t *off, | ||
466 | enum COMPUTE_PREEMPT_TYPE mode) { | ||
467 | uint32_t target_channel, target_runlist; | ||
468 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | ||
469 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | ||
470 | int err = kstrtou32_from_user(buf, count, 0, &target_channel); | ||
471 | if (err) | ||
472 | return err; | ||
473 | // (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID | ||
474 | target_runlist = file2gpuidx(f); | ||
475 | // Set preemption mode for the context of this channel | ||
476 | if ((err = set_channel_preemption_mode(g, target_channel, target_runlist, mode))) | ||
477 | return err; | ||
478 | |||
479 | return count; | ||
480 | } | ||
481 | |||
482 | ssize_t wfi_preempt_channel_file_write(struct file *f, const char __user *buf, | ||
483 | size_t count, loff_t *off) { | ||
484 | return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_WFI); | ||
485 | } | ||
486 | |||
487 | struct file_operations wfi_preempt_channel_file_ops = { | ||
488 | .write = wfi_preempt_channel_file_write, | ||
489 | .llseek = default_llseek, | ||
490 | }; | ||
491 | |||
492 | ssize_t cta_preempt_channel_file_write(struct file *f, const char __user *buf, | ||
493 | size_t count, loff_t *off) { | ||
494 | return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CTA); | ||
495 | } | ||
496 | |||
497 | struct file_operations cta_preempt_channel_file_ops = { | ||
498 | .write = cta_preempt_channel_file_write, | ||
499 | .llseek = default_llseek, | ||
500 | }; | ||
501 | |||
502 | ssize_t cil_preempt_channel_file_write(struct file *f, const char __user *buf, | ||
503 | size_t count, loff_t *off) { | ||
504 | return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CILP); | ||
505 | } | ||
506 | |||
507 | struct file_operations cil_preempt_channel_file_ops = { | ||
508 | .write = cil_preempt_channel_file_write, | ||
509 | .llseek = default_llseek, | ||
510 | }; | ||
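// Editor's note (assumption): like the other per-runlist files, these three
// write-only handlers are expected to be registered once per runlist (the
// runlist ID is recovered via file2gpuidx() above), so writing a channel ID
// to the wfi/cta/cil preempt files selects WFI-, CTA-, or CILP-level
// preemption for that channel's context via set_channel_preemption_mode().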
511 | |||
354 | // Tested working on Pascal (gp106) through Ada (ad102) | 512 | // Tested working on Pascal (gp106) through Ada (ad102) |
355 | ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, | 513 | ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, |
356 | size_t count, loff_t *off) { | 514 | size_t count, loff_t *off) { |
@@ -419,11 +577,13 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, | |||
419 | 577 | ||
420 | // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"? | 578 | // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"? |
421 | } | 579 | } |
580 | #warning switch_to_tsg has preempt_runlist omitted! | ||
581 | return count; | ||
422 | 582 | ||
423 | // Resubmit the runlist to ensure that changes to channel enablement are | 583 | // Resubmit the runlist to ensure that changes to channel enablement are |
424 | // picked up on Turing+ GPUs (channel enablements may not be otherwise). | 584 | // picked up on Turing+ GPUs (channel enablements may not be otherwise). |
425 | if (g->chip_id >= NV_CHIP_ID_TURING) | 585 | if (g->chip_id >= NV_CHIP_ID_TURING) |
426 | if ((err = resubmit_runlist(g, target_runlist))) | 586 | if ((err = resubmit_runlist(g, target_runlist, -1))) |
427 | return err; | 587 | return err; |
428 | 588 | ||
429 | // Trigger a runlist-level preempt to stop whatever was running, triggering | 589 | // Trigger a runlist-level preempt to stop whatever was running, triggering |
@@ -438,3 +598,470 @@ struct file_operations switch_to_tsg_file_ops = { | |||
438 | .write = switch_to_tsg_file_write, | 598 | .write = switch_to_tsg_file_write, |
439 | .llseek = default_llseek, | 599 | .llseek = default_llseek, |
440 | }; | 600 | }; |
601 | |||
602 | ssize_t preempt_runlist_file_write(struct file *f, const char __user *buffer, | ||
603 | size_t count, loff_t *off) { | ||
604 | uint32_t target_runlist; | ||
605 | struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; | ||
606 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | ||
607 | int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); | ||
608 | if (err) | ||
609 | return err; | ||
610 | |||
611 | // TODO: Check runlist is in-range | ||
612 | if ((err = preempt_runlist(g, target_runlist))) | ||
613 | return err; | ||
614 | |||
615 | return count; | ||
616 | } | ||
617 | |||
618 | struct file_operations preempt_runlist_file_ops = { | ||
619 | .write = preempt_runlist_file_write, | ||
620 | .llseek = default_llseek, | ||
621 | }; | ||
622 | |||
623 | // The value written to this file selects the runlist whose interrupt to acknowledge | ||
624 | ssize_t ack_bad_tsg_file_write(struct file *f, const char __user *buffer, | ||
625 | size_t count, loff_t *off) { | ||
626 | uint32_t target_runlist; | ||
627 | uint32_t rl_ram_off; | ||
628 | struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; | ||
629 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | ||
630 | int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); | ||
631 | if (err) | ||
632 | return err; | ||
633 | |||
634 | if ((err = get_runlist_ram(g, target_runlist, &rl_ram_off))) | ||
635 | return err; | ||
636 | |||
637 | nvdebug_writel(g, rl_ram_off + 0x100, 1 << 12); | ||
638 | |||
639 | return count; | ||
640 | } | ||
641 | |||
642 | struct file_operations ack_bad_tsg_file_ops = { | ||
643 | .write = ack_bad_tsg_file_write, | ||
644 | .llseek = default_llseek, | ||
645 | }; | ||
646 | |||
647 | // Rather than mapping all of BAR0, we just map: | ||
648 | // - On Pascal, Volta, Turing: MC_BOOT, PFIFO, PCCSR, PTOP | ||
649 | // - On Ampere: MC_BOOT, RAMRL(0), CHRAM(0), PTOP | ||
650 | // "All CUDA-managed pointers are within---the first 40 bits of the process's | ||
651 | // VA space" (Sec. 4.1, GPUDirect RDMA Documentation) | ||
652 | // - This means 0x00ff_ffff_ffff is the highest valid CUDA virtual address, | ||
653 | // and all higher addresses are unused. | ||
654 | // - So we use 0x6000_0000_0000+; this falls within the first PDE3 entry, and | ||
655 | // at the end of the PDE2 entries | ||
656 | // + Using the second PDE3 entry did not appear to work on Jetson (IIRC) | ||
657 | #define BAR0_USER_ADDR 0x0000700000000000llu | ||
658 | #define MEM_USER_ADDR 0x0000600000000000llu | ||
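// Editor's sketch (compile-time sanity check; assumes C11 _Static_assert is
// acceptable here): both windows sit above the 40-bit CUDA VA limit quoted
// above (1ULL << 40 == 0x100_0000_0000), so they cannot collide with
// CUDA-managed pointers.
_Static_assert(BAR0_USER_ADDR >= (1ull << 40), "BAR0 window overlaps the CUDA-managed VA range");
_Static_assert(MEM_USER_ADDR >= (1ull << 40), "VRAM window overlaps the CUDA-managed VA range");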
659 | |||
660 | /* Map all of GPU VRAM, and selected BAR0 regions, into a channel instance's | ||
661 | * virtual address space at predefined offsets (above). | ||
662 | * | ||
663 | * @param g Pointer to the nvdebug state for the selected GPU | ||
664 | * @param inst_ptr Dereferenceable pointer to the channel's instance block | ||
665 | * @returns 0 on success, -errno on error | ||
666 | * | ||
667 | * Support: Pascal, Volta, Turing, Ampere | ||
668 | */ | ||
669 | int map_mem_for_instance(struct nvdebug_state *g, instance_ctrl_t *inst_ptr) { | ||
670 | int ret; | ||
671 | uintptr_t off, ram_size; | ||
672 | dma_addr_t bus_mc_boot_ram, bus_ptop_ram, bus_fifo_ram, bus_chan_ctrl_ram; | ||
673 | uint64_t mc_boot_ram, ptop_ram, fifo_ram, chan_ctrl_ram; | ||
674 | page_dir_config_t chan_pd_config; | ||
675 | memory_range_t mem_range; | ||
676 | uint32_t channel_ram_off, runlist_ram_off, channel_ram_size, bar0_base; | ||
677 | struct iommu_domain *dom; | ||
678 | |||
679 | if (g->chip_id >= NV_CHIP_ID_AMPERE) { | ||
680 | runlist_channel_config_t channel_config; | ||
681 | if ((ret = get_runlist_ram(g, 0, &runlist_ram_off))) { | ||
682 | printk(KERN_ERR "[nvdebug] %s: Unable to determine location of runlist0 RAM!\n", __func__); | ||
683 | return ret; | ||
684 | } | ||
685 | if (runlist_ram_off & 0xfff) { | ||
686 | printk(KERN_ERR "[nvdebug] %s: Runlist0 RAM is not page-aligned!\n", __func__); | ||
687 | return -EAFNOSUPPORT; | ||
688 | } | ||
689 | if ((channel_config.raw = nvdebug_readl(g, runlist_ram_off + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) | ||
690 | return -EIO; | ||
691 | channel_ram_off = (uint32_t)channel_config.bar0_offset << 4; | ||
692 | if (channel_ram_off & 0xfff) { | ||
693 | printk(KERN_ERR "[nvdebug] %s: Runlist0 CHRAM is not page-aligned!\n", __func__); | ||
694 | return -EAFNOSUPPORT; | ||
695 | } | ||
696 | channel_ram_size = (1 << channel_config.num_channels_log2) * sizeof(channel_ctrl_ga100_t); | ||
697 | printk(KERN_DEBUG "[nvdebug] %s: Mapping CHRAM at %#018llx--%x and RLRAM at %#018llx--%x.\n", __func__, BAR0_USER_ADDR + channel_ram_off, channel_ram_size-1, BAR0_USER_ADDR + runlist_ram_off, 4095); | ||
698 | } else { | ||
699 | channel_ram_off = NV_PCCSR; | ||
700 | // MAX_CHID * sizeof(channel_ctrl_gf100_t) is < 4 KiB, so hardcode | ||
701 | channel_ram_size = 4096; | ||
702 | runlist_ram_off = NV_PFIFO; | ||
703 | } | ||
704 | |||
705 | // map_mem_by_chid() pulls the instance block via PRAMIN, so inst_ptr will | ||
706 | // be invalid after moving PRAMIN (e.g., as part of a page table operation). | ||
707 | // To avoid accessing inst_ptr after invalidation, keep a copy of what we | ||
708 | // need. | ||
709 | chan_pd_config = inst_ptr->pdb; | ||
710 | |||
711 | // map_page_directory_v1() is unimplemented, precluding Maxwell (or older) | ||
712 | // support (as they don't support v2 page tables). | ||
713 | if (!chan_pd_config.is_ver2) | ||
714 | return -EOPNOTSUPP; | ||
715 | |||
716 | // Determine the size of GPU physical memory (VRAM). | ||
717 | if ((mem_range.raw = nvdebug_readl(g, NV_FB_MMU_LOCAL_MEMORY_RANGE)) == -1) | ||
718 | return -EIO; | ||
719 | ram_size = memory_range_to_bytes(mem_range); | ||
720 | |||
721 | // We map memory using huge pages, and thus do not support GPUs with | ||
722 | // non-2-MiB-divisible VID_MEM sizes. | ||
723 | if (ram_size % (1 << 21) != 0) { | ||
724 | printk(KERN_ERR "[nvdebug] %s: GPU VID_MEM of %lu bytes is not a multiple of 2 MiB!\n", __func__, ram_size); | ||
725 | return -EAFNOSUPPORT; | ||
726 | } | ||
727 | |||
728 | // Map all of physical GPU memory (VID_MEM) into this channel's GPU virtual | ||
729 | // address space using huge (2 MiB) pages. | ||
730 | for (off = 0; off < ram_size; off += (1 << 21)) { | ||
731 | if ((ret = map_page_directory(g, chan_pd_config, | ||
732 | MEM_USER_ADDR + off, off, TARGET_VID_MEM, true)) < 0) | ||
733 | return ret; | ||
734 | // If the mapping already exists for this page directory, the other | ||
735 | // mappings should already exist, and can be skipped. | ||
736 | if (ret == 1) { | ||
737 | printk(KERN_INFO "[nvdebug] %s: VRAM mapping from %llx to %lx already exists. Assuming all mappings already exist and returning early...\n", __func__, MEM_USER_ADDR + off, off); | ||
738 | return 0; | ||
739 | } | ||
740 | } | ||
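// Illustrative arithmetic (editor's note): with 4 GiB of VRAM, for example, the
// loop above issues 4 GiB / 2 MiB = 2048 map_page_directory() calls, covering
// MEM_USER_ADDR through MEM_USER_ADDR + ram_size with 2 MiB huge pages.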
741 | |||
742 | // Map Channel RAM to a GPU-accessible bus address (gets past any IOMMU or | ||
743 | // IOVA layers), then map that address into this channel's GPU virtual | ||
744 | // address space. NV_PCCSR_CHANNEL_INST(0) is 4k-aligned, so it can be | ||
745 | // directly mapped. | ||
746 | // XXX: All these mappings are currently returning -1 on all reads on | ||
747 | // sunlight, jbakita-old, jetson-xavier, jetson-orin, and bonham, | ||
748 | // which seems to be returned from the PCIe root (on PCIe GPUs). | ||
749 | if (g->pcid) | ||
750 | bar0_base = pci_resource_start(g->pcid, 0); | ||
751 | else if (g->platd) | ||
752 | bar0_base = platform_get_resource(g->platd, IORESOURCE_MEM, 0)->start; | ||
753 | else | ||
754 | return -ENOTRECOVERABLE; | ||
755 | mc_boot_ram = NV_MC_BOOT_0 + bar0_base; | ||
757 | // PTOP fits within a page, but is not page-aligned; round down. | ||
757 | ptop_ram = (NV_PTOP & ~0xfffu) + bar0_base; | ||
758 | fifo_ram = runlist_ram_off + bar0_base; | ||
759 | chan_ctrl_ram = channel_ram_off + bar0_base; | ||
760 | |||
761 | // Check if GPU-accessible bus addresses are the same as CPU-visible physical | ||
762 | // addresses. Logic from amdgpu_device_check_iommu_direct_map(). | ||
763 | dom = iommu_get_domain_for_dev(g->dev); | ||
764 | if (!dom || dom->type == IOMMU_DOMAIN_IDENTITY) { | ||
765 | // Used for: jbakita-old, sunlight, jetson-xavier, jetson-orin integrated, bonham, ? | ||
766 | // (For all these, reads on the mapping return only -1.) | ||
767 | // (Forcing these through dma_map_resource()/iommu_map() changes nothing) | ||
768 | // (Note that the `ls -l /sys/class/iommu/*/devices` also reports that the | ||
769 | // GPU is not available under the I/O MMU on these platforms.) | ||
770 | // To fix this, please enable AMD-Vi/ARM SMMU/Intel VT-d in your BIOS | ||
771 | // settings, UEFI settings, or device-tree file. Supported on: | ||
772 | // - AMD: Bulldozer+ (or Phenom II w/ 890FX or 990FX Chipset) | ||
773 | // - Intel: Most since Core2 Duo | ||
774 | // Note that while the Jetson Orin has an SMMU (I/O MMU), the GPU does not | ||
775 | // appear to be configured by any pre-provided device tree files to use the | ||
776 | // SMMU. | ||
777 | printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is unavailable/disabled for GPU %x. Assuming phys and bus addresses are identical...\n", g->chip_id); | ||
778 | bus_mc_boot_ram = mc_boot_ram; | ||
779 | bus_ptop_ram = ptop_ram; | ||
780 | bus_fifo_ram = fifo_ram; | ||
781 | bus_chan_ctrl_ram = chan_ctrl_ram; | ||
782 | } else { | ||
783 | printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is enabled. Attempting to use dma_map_resource()...\n"); | ||
784 | // Used for: tama, yamaha | ||
785 | // Fails on tama, yamaha | ||
786 | // (Works on jetson-xavier, jetson-orin and bonham, but appears to be a no-op, and | ||
787 | // yields inaccessible memory. Get `mc-err: (255) csr_nvl7r: EMEM address decode error` | ||
788 | // on access on jetson boards, and a -1 read on all.) | ||
789 | bus_mc_boot_ram = dma_map_resource(g->dev, mc_boot_ram, 4096*2 /* *2 is a XXX hack to include PBUS */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC); | ||
790 | bus_ptop_ram = dma_map_resource(g->dev, ptop_ram, 4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC); | ||
791 | bus_fifo_ram = dma_map_resource(g->dev, fifo_ram, 4096*8 /* *8 is a XXX hack */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC); | ||
792 | bus_chan_ctrl_ram = dma_map_resource(g->dev, chan_ctrl_ram, 2*4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC); | ||
793 | if (dma_mapping_error(g->dev, bus_mc_boot_ram) || | ||
794 | dma_mapping_error(g->dev, bus_ptop_ram) || | ||
795 | dma_mapping_error(g->dev, bus_fifo_ram) || | ||
796 | dma_mapping_error(g->dev, bus_chan_ctrl_ram)) { | ||
797 | // Used for: tama, yamaha | ||
798 | printk(KERN_WARNING "[nvdebug] map_mem_ctxid: Unable to map BAR0 addresses to device-accessible addresses via dma_map_resource(). Return codes: %d for MC_BOOT, %d for PFIFO, %d for PCCSR.\n", | ||
799 | dma_mapping_error(g->dev, bus_mc_boot_ram), | ||
800 | dma_mapping_error(g->dev, bus_fifo_ram), | ||
801 | dma_mapping_error(g->dev, bus_chan_ctrl_ram)); | ||
802 | // This fallback does not appear to work on jbakita-old (5.4, GART IOMMU), but works on tama | ||
803 | if (!get_dma_ops(g->dev)) | ||
804 | printk(KERN_WARNING "[nvdebug] Reason: No DMA `ops`, and direct mapping failed.\n"); | ||
805 | else if (!get_dma_ops(g->dev)->map_resource) | ||
806 | // Fires on: tama, yamaha | ||
807 | printk(KERN_WARNING "[nvdebug] Reason: `map_resource` function undefined on this platform.\n"); | ||
808 | if (!dom) { | ||
809 | printk(KERN_ERR "[nvdebug] map_mem_ctxid: No I/O MMU available and dma_map_resource() failed. Aborting mapping of BAR0 regions!\n"); | ||
810 | return -ENOTRECOVERABLE; | ||
811 | } | ||
812 | printk(KERN_INFO "[nvdebug] map_mem_ctxid: Trying to fall back to direct I/O MMU manipulation...\n"); | ||
813 | // XXX: Fallback to directly creating the I/O MMU mappings. | ||
814 | // This is necessary. Directly accessing BAR0 addresses throws I/O MMU | ||
815 | // errors in the kernel log on yamaha. | ||
816 | // See also: comment on kfd_mem_dmamap_sg_bo() in amdgpu | ||
817 | // Note: dma_map_resource -> map_resource -> [arm_]iommu_map_resource | ||
818 | // -> __iommu_dma_map -> iommu_map is the happy-path, but this seems to | ||
819 | // regularly fail, even though the iommu_map path works. One key | ||
820 | // difference is that the dma_map_resource() path also includes | ||
821 | // IOMMU_MMIO in the iommu_map() flags. | ||
822 | bus_mc_boot_ram = mc_boot_ram; | ||
823 | bus_ptop_ram = ptop_ram; | ||
824 | bus_fifo_ram = fifo_ram; | ||
825 | bus_chan_ctrl_ram = chan_ctrl_ram; | ||
826 | // Create identity mapping | ||
827 | ret = iommu_map(dom, mc_boot_ram, mc_boot_ram, 4096*2 /* *2 is a hack to fit in PBUS*/, IOMMU_READ | IOMMU_WRITE); | ||
828 | if (ret < 0) { | ||
829 | printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for MC_BOOT!\n"); | ||
830 | return ret; | ||
831 | } | ||
832 | ret = iommu_map(dom, ptop_ram, ptop_ram, 4096, IOMMU_READ | IOMMU_WRITE); | ||
833 | if (ret < 0) { | ||
834 | printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PTOP!\n"); | ||
835 | return ret; | ||
836 | } | ||
837 | ret = iommu_map(dom, fifo_ram, fifo_ram, 4096*8 /* *8 is XXX hack*/, IOMMU_READ | IOMMU_WRITE); | ||
838 | if (ret < 0) { | ||
839 | printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for FIFO!\n"); | ||
840 | return ret; | ||
841 | } | ||
842 | ret = iommu_map(dom, chan_ctrl_ram, chan_ctrl_ram, channel_ram_size, IOMMU_READ | IOMMU_WRITE); | ||
843 | if (ret < 0) { | ||
844 | printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PCCSR!\n"); | ||
845 | return ret; | ||
846 | } | ||
847 | } | ||
848 | } | ||
849 | // TARGET_SYS_MEM_NONCOHERENT tells the GPU to bypass the CPU L2 cache for | ||
850 | // accesses to this memory. | ||
851 | // "Clients should normally use [SYS_MEM_NON_COHERENT]" (nvgpu) | ||
852 | // | ||
853 | // "Non-coherent system memory. | ||
854 | // (GPU) MMU will NOT maintain coherence with CPU L2 cache. | ||
855 | // Higher-level APIs should only allow this when it is known | ||
856 | // the memory is not cacheable by CPU or the coherency is | ||
857 | // managed explicitly (e.g. w/ flushes in SW). | ||
858 | // Also consider that this path is not necessarily faster." (open-gpu-kernel-modules) | ||
859 | // | ||
860 | // "Coherent system memory. | ||
861 | // (GPU) MMU will snoop CPU L2 cache if possible. | ||
862 | // This is usually the safer choice over NONCOH since it works | ||
863 | // whether the memory is cached by CPU L2 or not. | ||
864 | // On some CPU architectures going through CPU L2 may | ||
865 | // even be faster than the non-coherent path." (open-gpu-kernel-modules) | ||
866 | // | ||
867 | // I suspect that for SYS_MEM_NONCOHERENT mappings, the "no snoop" | ||
868 | // attribute bit will be set on associated PCIe read/write transactions. | ||
869 | // | ||
870 | // The only other bits in a PCIe read/write transaction that could be | ||
871 | // relevant are the two AT (Address Translation) bits added in PCIe 2.0. | ||
872 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0, | ||
873 | bus_mc_boot_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
874 | return ret; | ||
875 | // XXX | ||
876 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0 + 4096, | ||
877 | bus_mc_boot_ram + 4096, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
878 | return ret; | ||
879 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + (NV_PTOP & ~0xfffu), | ||
880 | bus_ptop_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
881 | return ret; | ||
882 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off, | ||
883 | bus_fifo_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
884 | return ret; | ||
885 | // XXX | ||
886 | for (off = 4096; off < 8*4096; off += 4096) | ||
887 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off+off, | ||
888 | bus_fifo_ram+off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
889 | return ret; | ||
890 | // Channel control RAM can span two or more pages on Ampere+ | ||
891 | for (off = 0; off < channel_ram_size; off += 4096) | ||
892 | if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + channel_ram_off + off, | ||
893 | bus_chan_ctrl_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0) | ||
894 | return ret; | ||
895 | return 0; | ||
896 | } | ||
897 | |||
898 | // Map by context ID | ||
899 | // See constituent functions for info on what they do; comments not repeated. | ||
900 | // Tested on Pascal, Volta, Turing, and Kepler | ||
901 | ssize_t map_mem_ctxid_file_write(struct file *f, const char __user *buffer, | ||
902 | size_t count, loff_t *off) { | ||
903 | int err; uint32_t target_context, target_runlist; | ||
904 | loff_t pos; | ||
905 | uint64_t instance_ptr; | ||
906 | enum INST_TARGET instance_target; | ||
907 | struct runlist_iter rl_iter; | ||
908 | instance_ctrl_t *inst; | ||
909 | context_switch_ctrl_t *ctx_block; | ||
910 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | ||
911 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | ||
912 | if ((err = kstrtou32_from_user(buffer, count, 0, &target_context))) | ||
913 | return err; | ||
914 | target_runlist = file2gpuidx(f); | ||
915 | |||
916 | // Get a dereferenceable pointer to the runlist | ||
917 | if ((err = get_runlist_iter(g, target_runlist, &rl_iter))) | ||
918 | return err; | ||
919 | // Find a channel in the runlist matching the provided context ID | ||
920 | for (pos = 0; pos < rl_iter.len; pos++, rl_iter.curr_entry += NV_RL_ENTRY_SIZE(g)) { | ||
921 | uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT; | ||
922 | if (entry_type(g, rl_iter.curr_entry) == ENTRY_TYPE_TSG) | ||
923 | continue; | ||
924 | // Get instance block address | ||
925 | if (g->chip_id >= NV_CHIP_ID_AMPERE) { | ||
926 | instance_ptr = ((struct gv100_runlist_chan*)rl_iter.curr_entry)->inst_ptr_hi; | ||
927 | instance_ptr <<= 32; | ||
928 | instance_ptr |= (uint64_t)inst_ptr_lo(g, rl_iter.curr_entry) << 12; | ||
929 | instance_target = inst_target(g, rl_iter.curr_entry); | ||
930 | ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0); | ||
931 | } else { | ||
932 | channel_ctrl_t chan; | ||
933 | chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, rl_iter.curr_entry))); | ||
934 | if (chan.raw == -1) | ||
935 | return -EIO; | ||
936 | instance_ptr = (uint64_t)chan.inst_ptr << 12; | ||
937 | instance_target = chan.inst_target; | ||
938 | } | ||
939 | // Skip channels with unconfigured or INVALID instance blocks | ||
940 | if (!instance_ptr || instance_target == 1) { | ||
941 | printk(KERN_WARNING "[nvdebug] Channel %d is in runlist %d, but " | ||
942 | "lacks a valid instance block", chid(g, rl_iter.curr_entry), | ||
943 | target_runlist); | ||
944 | continue; | ||
945 | } | ||
946 | |||
947 | // Get a dereferenceable pointer to the instance block | ||
948 | if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target))) | ||
949 | return PTR_ERR(inst); | ||
950 | // If unable to access instance block, skip | ||
951 | if (!inst) | ||
952 | continue; | ||
953 | |||
954 | // Get a dereferenceable pointer to the CTXSW block | ||
955 | if (IS_ERR(ctx_block = get_ctxsw(g, inst))) | ||
956 | return PTR_ERR(ctx_block); | ||
957 | // If unable to access CTXSW block, skip | ||
958 | if (!ctx_block) | ||
959 | continue; | ||
960 | // Check if the context ID matches | ||
961 | if (ctx_block->context_id != target_context) | ||
962 | continue; | ||
963 | |||
964 | // XXX: Disable the context switch timeout while we're here | ||
965 | ctxsw_timeout_t timeout_config; | ||
966 | if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1) | ||
967 | return -EIO; | ||
968 | timeout_config.enabled = 0; | ||
969 | nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw); | ||
970 | // XXX: Attempt setting preemption mode while we're here | ||
971 | ctx_block->compute_preemption_options = PREEMPT_CTA; | ||
972 | |||
973 | // Map memory and return | ||
974 | if ((err = map_mem_for_instance(g, inst)) < 0) | ||
975 | return err; | ||
976 | return count; | ||
977 | } | ||
978 | return -ESRCH; | ||
979 | } | ||
980 | |||
981 | struct file_operations map_mem_ctxid_file_ops = { | ||
982 | .write = map_mem_ctxid_file_write, | ||
983 | .llseek = default_llseek, | ||
984 | }; | ||
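For completeness, a hedged userspace sketch of driving the file registered with `map_mem_ctxid_file_ops` (the procfs path below is an assumption; the real location depends on how `nvdebug_entry.c` registers the file):

```c
/* Hypothetical usage sketch: write a context ID (as text) to the assumed
 * per-runlist map_mem_ctxid file to request the mappings set up above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int request_ctx_mapping(unsigned int ctx_id)
{
	char buf[16];
	int fd = open("/proc/gpu0/runlist0/map_mem_ctxid", O_WRONLY); /* assumed path */
	if (fd < 0)
		return -1;
	snprintf(buf, sizeof(buf), "%u", ctx_id);
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
```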
985 | |||
986 | // Map by channel ID (LEGACY; unclear if this needs to be kept) | ||
987 | // Support: Pascal, Volta, and Turing only | ||
988 | ssize_t map_mem_chid_file_write(struct file *f, const char __user *buffer, | ||
989 | size_t count, loff_t *off) { | ||
990 | int ret, target_channel; | ||
991 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | ||
992 | channel_ctrl_t chan; | ||
993 | instance_ctrl_t *inst_ptr; | ||
994 | bool all = false; | ||
995 | uint64_t inst_ptr_off; | ||
996 | page_dir_config_t bar2_pd_config; | ||
997 | // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec | ||
998 | if ((ret = kstrtos32_from_user(buffer, count, 0, &target_channel))) | ||
999 | return ret; | ||
1000 | |||
1001 | if (g->chip_id >= NV_CHIP_ID_AMPERE) | ||
1002 | return -ENOSYS; | ||
1003 | |||
1004 | // This API is for nvsched, which is only supported on GPUs which support | ||
1005 | // instruction-level preemption (Pascal+). | ||
1006 | if (g->chip_id < NV_CHIP_ID_PASCAL) | ||
1007 | return -EOPNOTSUPP; | ||
1008 | |||
1009 | if (target_channel > MAX_CHID) | ||
1010 | return -ERANGE; | ||
1011 | |||
1012 | // Passing -1 indicates that all channels should be mapped | ||
1013 | if (target_channel == -1) { | ||
1014 | all = true; | ||
1015 | target_channel = 0; | ||
1016 | } | ||
1017 | |||
1018 | do { | ||
1019 | printk(KERN_INFO "[nvdebug] Mapping channel %d\n", target_channel); | ||
1020 | // Read the channel's configuration block, which includes the address of | ||
1021 | // this channel's instance block, which contains a page table pointer. | ||
1022 | // TODO: Verify this works with the channel RAM changes on Ampere+ | ||
1023 | chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel)); | ||
1024 | if (chan.raw == -1) | ||
1025 | return -EIO; | ||
1026 | |||
1027 | // If the instance pointer is unconfigured or the target is 1 (INVALID), | ||
1028 | // this channel is not in-use on any runlist and can be skipped. | ||
1029 | if (chan.inst_ptr == 0 || chan.inst_target == 1) | ||
1030 | continue; | ||
1031 | |||
1032 | // Find page tables which define how BAR2 offsets are translated to physical | ||
1033 | // VID_MEM/SYS_MEM addresses. (We have to do this every time since we reset | ||
1034 | // PRAMIN.) | ||
1035 | if ((ret = get_bar2_pdb(g, &bar2_pd_config)) < 0) | ||
1036 | return ret; | ||
1037 | |||
1038 | // Pascal+ GPUs use Version 2 page tables, so this shouldn't be a problem | ||
1039 | if (!bar2_pd_config.is_ver2) | ||
1040 | return -ENOSYS; | ||
1041 | |||
1042 | // To read the instance block, first find where it is mapped in BAR2 | ||
1043 | if ((inst_ptr_off = search_page_directory(g, bar2_pd_config, (u64)chan.inst_ptr << 12, chan.inst_target)) == 0) { | ||
1044 | // If no mapping can be found in BAR2, fallback to accessing the | ||
1045 | // instance block via the PRAMIN window. | ||
1046 | printk(KERN_WARNING "[nvdebug] Warning: Channel %d has no instance " | ||
1047 | "block mapped in BAR2. Falling back to PRAMIN...\n", target_channel); | ||
1048 | if ((ret = addr_to_pramin_mut(g, (u64)chan.inst_ptr << 12, chan.inst_target)) < 0) | ||
1049 | return -EOPNOTSUPP; | ||
1050 | inst_ptr = g->regs + NV_PRAMIN + ret; | ||
1051 | } else { | ||
1052 | inst_ptr = g->bar2 + inst_ptr_off; | ||
1053 | } | ||
1054 | |||
1055 | if ((ret = map_mem_for_instance(g, inst_ptr))) | ||
1056 | return ret; | ||
1057 | |||
1058 | // If mapping all channels, start again at the next one | ||
1059 | } while (all && ++target_channel <= MAX_CHID); | ||
1060 | |||
1061 | return count; | ||
1062 | } | ||
1063 | |||
1064 | struct file_operations map_mem_chid_file_ops = { | ||
1065 | .write = map_mem_chid_file_write, | ||
1066 | .llseek = default_llseek, | ||
1067 | }; | ||