From 306a03d18b305e4e573be3b2931978fa10679eb9 Mon Sep 17 00:00:00 2001
From: Joshua Bakita
Date: Thu, 22 Jun 2023 12:52:59 -0400
Subject: Quick dump of current state for Ben to review.

---
 Makefile             |  13 +-
 device_info_procfs.c | 126 +++++++++
 mmu.c                | 251 ++++++++++++++++++
 nvdebug.h            | 719 +++++++++++++++++++++++++++++++++++++++++++++++----
 nvdebug_entry.c      | 288 ++++++++++++++++++---
 runlist.c            | 221 ++++++++--------
 runlist_procfs.c     | 188 ++++++++------
 stubs.h              |  80 ++++++
 8 files changed, 1614 insertions(+), 272 deletions(-)
 create mode 100644 device_info_procfs.c
 create mode 100644 mmu.c
 create mode 100644 stubs.h

diff --git a/Makefile b/Makefile
index 18c07e8..2dc90c7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,14 @@
 obj-m += nvdebug.o
-nvdebug-objs = runlist_procfs.o runlist.o nvdebug_entry.o
+nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o nvdebug_entry.o
 KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\"
+# Add -mfentry to KBUILD_CFLAGS above if the build fails due to a missing mcount symbol
 # TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
-#ccflags-y += -I$(PWD)/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi
+ccflags-y += -I$(PWD)/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

 all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
diff --git a/device_info_procfs.c b/device_info_procfs.c
new file mode 100644
index 0000000..cd6c53c
--- /dev/null
+++ b/device_info_procfs.c
@@ -0,0 +1,126 @@
+#include "nvdebug.h"
+#include <linux/seq_file.h> // For seq_* functions and types
+#include <linux/uaccess.h> // For copy_to_user()
+
+// Generic register printing function, used for PTOP_*_NUM registers (+more)
+// @param f    File being read from. `data` field is register offset to read.
+// @param buf  User buffer for result
+// @param size Length of user buffer
+// @param off  Requested offset. Updated by number of characters written.
+// @return -errno on error, otherwise number of bytes written to *buf
+// Note: Parent `data` field MUST be the GPU index
+static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
+	char out[16];
+	int chars_written;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	if (size < 16 || *off != 0)
+		return 0;
+	// A 32-bit register will always take less than 16 characters to print
+	chars_written = scnprintf(out, 16, "%#0x\n", nvdebug_readl(g, (uintptr_t)PDE_DATA(file_inode(f))));
+	if (copy_to_user(buf, out, chars_written))
+		printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
+	*off += chars_written;
+	return chars_written;
+}
+const struct file_operations nvdebug_read_reg32_file_ops = {
+	.read = nvdebug_reg32_read,
+};
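+
+// Usage sketch: nvdebug_entry.c (later in this patch) binds this handler to a
+// register by stashing the register offset in the ProcFS entry's data field,
+// which nvdebug_reg32_read() recovers via PDE_DATA(file_inode(f)):
+//
+//   proc_create_data("num_gpcs", 0444, dir, &nvdebug_read_reg32_file_ops,
+//                    (void*)NV_PTOP_SCAL_NUM_GPCS);
+//
+// Reading the file (e.g. `cat /proc/gpu0/num_gpcs`) then prints that register.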
+
+//// ==v== PTOP_DEVICE_INFO ==v== ////
+
+// Called to start or resume a sequence. Prior to 4.19, *pos is unreliable, so
+// the iterator state is kept in the static variable `idx` instead.
+// Initializes iterator `idx` state and returns it. Ends sequence on NULL.
+static void* device_info_file_seq_start(struct seq_file *s, loff_t *pos) {
+	static int idx;
+	// If start of sequence, reset `idx`
+	if (*pos == 0)
+		idx = 0;
+	// Number of possible info entries is fixed, and list is sparse
+	if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1)
+		return NULL;
+	return &idx;
+}
+
+// Steps to next record. Returns new value of `idx`.
+// Calls show() on non-NULL return
+static void* device_info_file_seq_next(struct seq_file *s, void *idx,
+                                       loff_t *pos) {
+	(*pos)++; // Required by seq interface
+	// Number of possible info entries is fixed, and list is sparse
+	// (pre-increment, so that show() is never called past the last entry)
+	if (++(*(int*)idx) >= NV_PTOP_DEVICE_INFO__SIZE_1)
+		return NULL;
+	return idx;
+}
+
+// Print info at index *idx. Returns non-zero on error.
+static int device_info_file_seq_show(struct seq_file *s, void *idx) {
+	ptop_device_info_t curr_info;
+	struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
+
+	curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO(*(int*)idx));
+	// Check for read errors
+	if (curr_info.raw == -1)
+		return -EIO;
+
+	// Parse and print the data
+	switch(curr_info.info_type) {
+	case INFO_TYPE_DATA:
+		// As of early 2022, only the ENUM2 format of this entry exists
+		if (curr_info.is_not_enum2)
+			break;
+		seq_printf(s, "| BAR0 Base %#.8x\n"
+			      "| instance %d\n",
+			   curr_info.pri_base << 12, curr_info.inst_id);
+		if (curr_info.fault_id_is_valid)
+			seq_printf(s, "| Fault ID: %3d\n", curr_info.fault_id);
+		break;
+	case INFO_TYPE_ENUM:
+		if (curr_info.engine_is_valid)
+			seq_printf(s, "| Host's Engine ID: %2d\n", curr_info.engine_enum);
+		if (curr_info.runlist_is_valid)
+			seq_printf(s, "| Runlist ID: %2d\n", curr_info.runlist_enum);
+		if (curr_info.intr_is_valid)
+			seq_printf(s, "| Interrupt ID: %2d\n", curr_info.intr_enum);
+		if (curr_info.reset_is_valid)
+			seq_printf(s, "| Reset ID: %2d\n", curr_info.reset_enum);
+		break;
+	case INFO_TYPE_ENGINE_TYPE:
+		seq_printf(s, "| Engine Type: %2d (", curr_info.engine_type);
+		if (curr_info.engine_type < ENGINE_TYPES_LEN)
+			seq_printf(s, "%s)\n", ENGINE_TYPES_NAMES[curr_info.engine_type]);
+		else
+			seq_printf(s, "Unknown Engine, introduced post-Ampere)\n");
+		break;
+	case INFO_TYPE_NOT_VALID:
+	default:
+		// Device info records are sparse, so skip unset or unknown ones
+		return 0;
+	}
+
+	// Draw a line between each device entry
+	if (!curr_info.has_next_entry)
+		seq_printf(s, "+---------------------+\n");
+	return 0;
+}
+
+static void device_info_file_seq_stop(struct seq_file *s, void *idx) {
+	// No cleanup needed
+}
+
+static const struct seq_operations device_info_file_seq_ops = {
+	.start = device_info_file_seq_start,
+	.next = device_info_file_seq_next,
+	.stop = device_info_file_seq_stop,
+	.show = device_info_file_seq_show,
+};
+
+static int device_info_file_open(struct inode *inode, struct file *f) {
+	return seq_open(f, &device_info_file_seq_ops);
+}
+
+const struct file_operations device_info_file_ops = {
+	.open = device_info_file_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
diff --git a/mmu.c b/mmu.c
new file mode 100644
index 0000000..26c7af5
--- /dev/null
+++ b/mmu.c
@@ -0,0 +1,251 @@
+// Helpers to deal with NVIDIA's MMU and associated page tables
+#include <linux/types.h> // Kernel types
+
+#include "nvdebug.h"
+
+/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
+   a configurable 1MB window into VRAM which is mapped into BAR0 (register)
+   space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
+   and appears to be used today to bootstrap page table configuration.
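+
+   For example, here is a sketch (using the register definitions from
+   nvdebug.h; `g`, `pa`, and `val` are hypothetical locals) of reading the
+   32-bit word at VRAM physical address `pa` through the window:
+
+     bar0_window_t win;
+     win.raw = 0;
+     win.base = pa >> 16; // Window base is in 64KiB units
+     win.target = TARGET_VID_MEM;
+     nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, win.raw);
+     val = nvdebug_readl(g, NV_PRAMIN + (pa & 0xffff));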
+
+   Why is it mapped at a location called NVIDIA Private RAM Instance? Because
+   this used to point to the entirety of instance RAM, which was separate from
+   VRAM on older NVIDIA GPUs.
+*/
+
+/* Convert a physical VRAM address to an offset in the PRAMIN window
+   @param addr VRAM address to convert
+   @return 0 on error, PRAMIN offset on success
+
+   Note: Use phy2PRAMIN() instead if you want a dereferenceable address
+*/
+uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
+	uint64_t pramin_base_va;
+	bar0_window_t window;
+	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
+	// Check if the address is valid (49 bits are addressable on-GPU)
+	if (addr & ~0x0001ffffffffffff) {
+		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
+		       addr, __func__);
+		return 0;
+	}
+	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
+	if (window.target != TARGET_VID_MEM)
+		return 0;
+	pramin_base_va = ((uint64_t)window.base) << 16;
+	// Protect against out-of-bounds accesses
+	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
+		return 0;
+	return addr - pramin_base_va;
+}
+
+/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
+   straightforward starting with Pascal ("page table version 2"), except for a
+   few quirks (like 16-byte PDE0 entries, but all other entries are 8 bytes).
+
+   All you really need to know is that any given Page Directory Entry (PDE)
+   contains a pointer to the start of a 4k page densely filled with PDEs or Page
+   Table Entries (PTEs).
+
+   == Page Table Refresher ==
+   Page tables convert virtual addresses to physical addresses, and they do this
+   via a tree structure. Leaves (PTEs) contain a physical address, and the path
+   from root to leaf is defined by the virtual address. Non-leaf nodes are PDEs.
+   When descending, the virtual address is sliced into pieces, and one slice is
+   used at each level (as an index) to select the next-visited node (in level+1).
+
+   V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
+   PTEs. How the virtual address is sliced to yield an index into each level and
+   a page offset is shown by Fig 1.
+
+   == Figure 1 ==
+   Page Offset (12 bits)                 <------------------------------------+
+   Page Table Entry (PTE) (9 bits)       <--------------------+               |
+   Page Directory Entry (PDE) 0 (8 bits) <-----------+        |               |
+   PDE1 (9 bits) <--------------------+              |        |               |
+   PDE2 (9 bits) <-----------+        |              |        |               |
+   PDE3 (2 bits) <--+        |        |              |        |               |
+                    ^        ^        ^              ^        ^               ^
+   Virtual addr: [48, 47] [46, 38] [37, 29]      [28, 21] [20, 12]       [11, 0]
+
+   The following arrays merely represent different projections of Fig. 1, and
+   only one is strictly needed to reconstruct all the others. However, due to
+   the complexity of page tables, we include all of these to aid in readability.
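+
+   As a worked sketch of Fig. 1: since every entry count below is a power of
+   two, the index into level `l` for a virtual address `va` is simply
+
+     idx = (va >> NV_MMU_PT_V2_LSB[l]) & (NV_MMU_PT_V2_SZ[l] - 1);
+
+   e.g. l == 0 (PDE3) extracts bits [48, 47], selecting one of 4 entries.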
+*/
+// How many nodes/entries per level in V2 of NVIDIA's page table format
+static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
+// Size in bytes of an entry at a particular level
+static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
+// Which bit index is the least significant in indexing each page level
+static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};
+
+// Convert a GPU physical address to CPU virtual address via the PRAMIN window
+// Note: Falls back to the PRAMIN base if vram2PRAMIN() fails (returns 0)
+void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
+	return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
+}
+
+/* FIXME
+void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
+	return g->bar2 + off;
+}
+*/
+
+uint64_t search_page_directory_subtree(struct nvdebug_state *g,
+				       void __iomem *pde_offset,
+				       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+				       uint64_t addr_to_find,
+				       uint32_t level) {
+	uint64_t res, i;
+	void __iomem *next;
+	page_dir_entry_t entry;
+	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
+		return 0;
+	// Hack to work around PDE0 being double-size and strangely formatted
+	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
+		pde_offset += 8;
+	entry.raw = readl(pde_offset);
+	// If we reached an invalid (unpopulated) PDE, walk back up the tree
+	if (entry.target == PD_AND_TARGET_INVALID)
+		return 0;
+	// Succeed when we reach a PTE with the address we want
+	if (entry.is_pte) {
+		printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n", ((u64)entry.addr) << 12, entry.raw);
+		return (uint64_t)entry.addr << 12 == addr_to_find;
+	}
+	printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n", ((u64)entry.addr) << 12, entry.target, level, entry.raw);
+	// The last level contains only PTEs; a PDE cannot point past it
+	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
+		return 0;
+	// Depth-first search of the page table (the child table at level + 1)
+	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
+		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
+		// off2addr can fail
+		if (!next) {
+			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
+			return 0;
+		}
+		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
+		if (res)
+			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
+	}
+	return 0;
+}

+/* Search a page directory of the GPU MMU
+   @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
+   @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
+   @param addr_to_find Physical address to reconstruct the virtual address of
+   @return 0 on error, otherwise the virtual address at which addr_to_find is
+           mapped into by this page table.
+*/
+uint64_t search_page_directory(struct nvdebug_state *g,
+			       void __iomem *pde_offset,
+			       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+			       uint64_t addr_to_find) {
+	uint64_t res, i;
+	// Make sure that the query is page-aligned
+	if (addr_to_find & 0xfff) {
+		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
+		return 0;
+	}
+	// Search the top-level page directory (PDE3)
+	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
+		if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
+			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
+	return 0;
+}
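+
+// Usage sketch (hypothetical locals): given the VRAM address `pdb` of a PDE3
+// table (e.g. as located via the BAR2 instance block in runlist.c), recover
+// the GPU virtual address at which physical address `runlist_iova` is mapped:
+//
+//   uint64_t rl_va = search_page_directory(g, phy2PRAMIN(g, pdb),
+//                                          phy2PRAMIN, runlist_iova);
+//   if (!rl_va)
+//       printk(KERN_WARNING "[nvdebug] Runlist not mapped in BAR2/3 tables\n");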
+
+/* GMMU Page Tables Version 1
+   This page table only contains 2 levels and is used in the Fermi, Kepler, and
+   Maxwell architectures
+*/
+// Number of entries in the PDE and PTE levels
+static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 1<<13 is an educated guess!!!
+// Which bit index is the least significant in indexing each page level
+static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
+uint64_t search_v1_page_directory(struct nvdebug_state *g,
+				  void __iomem *pde_offset,
+				  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+				  uint64_t addr_to_find) {
+	uint64_t j, i = 0;
+	page_dir_entry_v1_t pde;
+	page_tbl_entry_v1_t pte;
+	void __iomem *pte_offset;
+	// For each PDE
+	do {
+		// readq doesn't seem to work on BAR0
+		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
+		pde.raw <<= 32;
+		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
+		// Verify PDE is present
+		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
+			continue;
+		// Convert to a dereferenceable pointer from CPU virtual address space
+		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
+		if (!pte_offset)
+			continue;
+//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
+//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
+		// For each PTE
+		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
+			// Don't overrun the PRAMIN window
+			if (pte_offset > NV_PRAMIN + g->regs + NV_PRAMIN_LEN)
+				return 0;
+			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
+			pte.raw <<= 32;
+			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
+			// Skip non-present PTEs
+			if (!pte.is_present)
+				continue;
+//			printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
+			// If we find a matching PTE, return its virtual address
+			if ((uint64_t)pte.addr << 12 == addr_to_find)
+				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
+		}
+	} while (++i < NV_MMU_PT_V1_SZ[0]);
+	return 0;
+}
+
+/* GMMU Page Tables Version 0
+   This page table only contains 2 levels and is used in the Tesla architecture
+*/
+/* *** UNTESTED ***
+#define NV_MMU_PT_V0_SZ 2048
+#define NV_MMU_PT_V0_LSB 29
+uint64_t search_v0_page_directory(struct nvdebug_state *g,
+				  void __iomem *pde_offset,
+				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
+				  uint32_t addr_to_find) {
+	int j, i = 0;
+	page_dir_entry_v0_t pde;
+	page_tbl_entry_v0_t pte;
+	void __iomem *pte_offset;
+	// For each PDE
+	do {
+		// readq doesn't seem to work on BAR0
+		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
+		pde.raw <<= 32;
+		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
+		//if (pde.raw)
+		//	printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
+		// Skip unpopulated PDEs
+		if (pde.type == NOT_PRESENT)
+			continue;
+		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
+		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
+		// For each PTE
+		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
+			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
+			pte.raw <<= 32;
+			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
+			// Skip non-present PTEs
+			if (!pte.is_present)
+				continue;
+			// If we find a matching PTE, return its virtual address
+			//if (pte.addr != 0x5555555)
+			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
+			if (pte.addr << 12 == addr_to_find)
+				return i << NV_MMU_PT_V0_LSB | j << 12;
+		}
+	} while (++i < NV_MMU_PT_V0_SZ);
+	return 0; // No match
+}
+*/
diff --git a/nvdebug.h b/nvdebug.h
index 9ac71da..1882756 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -5,14 +5,18 @@
 // TODO(jbakita): Don't depend on these.
 #include <nvgpu/gk20a.h> // For struct gk20a
 #include <os/linux/os_linux.h> // For struct nvgpu_os_linux
+#include <linux/proc_fs.h> // For PDE_DATA() macro
 
 /* Runlist Channel
   A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
   of GPU commands. These commands are typically queued from userspace.
 
-  `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
-  virtual address space for this context. All channels in a TSG point to the
-  same GPU Instance Block (?).
+  Prior to Volta, channels could also exist independent of a TSG. These are
+  called "bare channels" in the Jetson nvgpu driver.
+
+  `INST_PTR` points to a GPU Instance Block which contains FIFO states, virtual
+  address space configuration for this context, and a pointer to the page
+  tables. All channels in a TSG point to the same GPU Instance Block (?).
 
   "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
   thereby which PBDMA will run the channel. Increasing values select
@@ -30,7 +34,13 @@
   ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
   CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
   RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
-                          more than one PBDMA is supported by the runlist
+                          more than one PBDMA is supported by the runlist,
+                          additionally, "A value of 0 targets the first FE
+                          pipe, which can process all FE driven engines:
+                          Graphics, Compute, Inline2Memory, and TwoD.
A value + of 1 targets the second FE pipe, which can only + process Compute work. Note that GRCE work is allowed + on either runqueue.)" INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer INST_PTR_HI : upper 32 bit of instance block pointer @@ -39,6 +49,9 @@ USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer USERD_PTR_HI : upper 32 bits of USERD pointer USERD_TARGET (TGU) : aperture of the USERD data structure + + Channels were around since at least Fermi, but were rearranged with Volta to + add a USERD pointer, a longer INST pointer, and a runqueue selector flag. */ enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; @@ -52,11 +65,12 @@ static inline char* target_to_text(enum INST_TARGET t) { return "SYS_MEM_NONCOHERENT"; default: printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); - return NULL; + return "INVALID"; } } -struct runlist_chan { +// Support: Volta, Ampere, Turing +struct gv100_runlist_chan { // 0:63 enum ENTRY_TYPE entry_type:1; uint32_t runqueue_selector:1; @@ -71,6 +85,20 @@ struct runlist_chan { uint32_t inst_ptr_hi:32; } __attribute__((packed)); +// Support: Fermi, Kepler*, Maxwell, Pascal +// *In Kepler, inst fields may be unpopulated? +struct gm107_runlist_chan { + uint32_t chid:12; + uint32_t padding0:1; + enum ENTRY_TYPE entry_type:1; + uint32_t padding1:18; + uint32_t inst_ptr_lo:20; + enum INST_TARGET inst_target:2; // Totally guessing on this + uint32_t padding2:10; +} __attribute__((packed)); + +#define gk110_runlist_chan gm107_runlist_chan + /* Runlist TSG (TimeSlice Group) The runlist is composed of timeslice groups (TSG). Each TSG corresponds to a single virtual address space on the GPU and contains `TSG_LENGTH` @@ -85,8 +113,15 @@ struct runlist_chan { TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice TSG_LENGTH : number of channels that are part of this timeslice group TSGID : identifier of the Timeslice group (overlays ENTRY_ID) + + TSGs appear to have been introduced with Kepler and stayed the same until + they were rearranged at the time of channel rearrangement to support longer + GPU instance addresses with Volta. */ -struct entry_tsg { + +// Support: Volta, Ampere*, Turing* +// *These treat the top 8 bits of TSGID as GFID (unused) +struct gv100_runlist_tsg { // 0:63 enum ENTRY_TYPE entry_type:1; uint64_t padding:15; @@ -101,14 +136,28 @@ struct entry_tsg { } __attribute__((packed)); #define MAX_TSGID (1 << 12) +// Support: Kepler (v2?), Maxwell, Pascal +// Same fields as Volta except tsg_length is 6 bits rather than 8 +// Last 32 bits appear to contain an undocumented inst ptr +struct gk110_runlist_tsg { + uint32_t tsgid:12; + uint32_t padding0:1; + enum ENTRY_TYPE entry_type:1; + uint32_t timeslice_scale:4; + uint32_t timeslice_timeout:8; + uint32_t tsg_length:6; + uint32_t padding1:32; +} __attribute__((packed)); + + enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1}; /* Preempt a TSG or Channel by ID - ID/CHID : Id of TSG or channel to preempt - IS_PENDING : ???? - TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG + ID/CHID : Id of TSG or channel to preempt + IS_PENDING : Is a context switch pending? 
+  TYPE       : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
 
-  Support: Kepler, Maxwell, Pascal, Volta
+  Support: Kepler, Maxwell, Pascal, Volta, Turing
 */
 #define NV_PFIFO_PREEMPT 0x00002634
 typedef union {
@@ -195,26 +244,36 @@ typedef union {
 */
 // Note: This is different with Turing
-// Support: Kepler, Maxwell, Pascal, Volta
+// Support: Fermi, Kepler, Maxwell, Pascal, Volta
 #define NV_PFIFO_RUNLIST_BASE 0x00002270
+#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
 typedef union {
 	struct {
 		uint32_t ptr:28;
-		uint32_t type:2;
+		enum INST_TARGET target:2;
 		uint32_t padding:2;
 	} __attribute__((packed));
 	uint32_t raw;
 } runlist_base_t;
 
 // Support: Kepler, Maxwell, Pascal, Volta
+// Works on Fermi, but id is one bit longer and is b11111
 #define NV_PFIFO_RUNLIST 0x00002274
+#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
 typedef union {
+	// RUNLIST fields
 	struct {
 		uint32_t len:16;
 		uint32_t padding:4;
-		uint32_t id:4;
+		uint32_t id:4; // Runlist ID (each engine may have a separate runlist)
 		uint32_t padding2:8;
 	} __attribute__((packed));
+	// ENG_RUNLIST fields that differ
+	struct {
+		uint32_t padding3:20;
+		bool is_pending:1; // Is runlist not yet committed?
+		uint32_t padding4:11;
+	} __attribute__((packed));
 	uint32_t raw;
 } runlist_info_t;
 
@@ -301,63 +360,631 @@ typedef union {
 	uint32_t raw;
 } runlist_disable_t;
 
+/* Read GPU descriptors from the Master Controller (MC)
+
+   MINOR_REVISION : Legacy (only used with Kelvin in Nouveau)
+   MAJOR_REVISION : Legacy (only used with Kelvin in Nouveau)
+   IMPLEMENTATION : Which implementation of the GPU architecture
+   ARCHITECTURE   : Which GPU architecture
+
+   CHIP_ID = (ARCHITECTURE << 4) | IMPLEMENTATION
+   CHIP_ID : Unique ID of all chips since Kelvin
+
+   Support: Kelvin, Rankine, Curie, Tesla, Fermi, Kepler, Maxwell, Pascal,
+            Volta, Turing, Ampere
+*/
+#define NV_MC_BOOT_0 0x00000000
+#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
+#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
+#define NV_CHIP_ID_KEPLER 0x0E0
+#define NV_CHIP_ID_VOLTA 0x140
+
+inline static const char* ARCH2NAME(uint32_t arch) {
+	switch (arch) {
+	case 0x01:
+		return "Celsius";
+	case 0x02:
+		return "Kelvin";
+	case 0x03:
+		return "Rankine";
+	case 0x04:
+	case 0x06: // 0x06 is (nForce 6XX integrated only)
+		return "Curie";
+	// 0x07 is unused/skipped
+	case 0x05: // First Tesla card was released before the nForce 6XX
+	case 0x08:
+	case 0x09:
+	case 0x0A:
+		return "Tesla";
+	// 0x0B is unused/skipped
+	case 0x0C:
+	case 0x0D:
+		return "Fermi";
+	case 0x0E:
+	case 0x0F:
+	case 0x11:
+		return "Kepler";
+	case 0x12:
+		return "Maxwell";
+	case 0x13:
+		return "Pascal";
+	case 0x14:
+	case 0x15: // Volta integrated
+		return "Volta";
+	case 0x16:
+		return "Turing";
+	case 0x17:
+		return "Ampere";
+	case 0x18:
+	case 0x19:
+		return "Hopper (?)
or Lovelace (?)"; + default: + if (arch < 0x19) + return "[unknown historical architecture]"; + else + return "[future]"; + } +} + +typedef union { + // Fields as defined in the NVIDIA reference + struct { + uint32_t minor_revision:4; + uint32_t major_revision:4; + uint32_t reserved:4; + uint32_t padding0:8; + uint32_t implementation:4; + uint32_t architecture:5; + uint32_t padding1:3; + } __attribute__((packed)); + uint32_t raw; + // Arch << 4 + impl is also often used + struct { + uint32_t padding2:20; + uint32_t chip_id:9; + uint32_t padding3:3; + } __attribute__((packed)); +} mc_boot_0_t; + +enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3}; +enum ENGINE_TYPES { + ENGINE_GRAPHICS = 0, // GRAPHICS [/compute] + ENGINE_COPY0 = 1, // [raw/physical] COPY #0 + ENGINE_COPY1 = 2, // [raw/physical] COPY #1 + ENGINE_COPY2 = 3, // [raw/physical] COPY #2 + + ENGINE_MSPDEC = 8, // Picture DECoder + ENGINE_MSPPP = 9, // [Video] Post Processing + ENGINE_MSVLD = 10, // [Video] Variable Length Decoder + ENGINE_MSENC = 11, // [Video] ENCoding + ENGINE_VIC = 12, // Video Image Compositor + ENGINE_SEC = 13, // SEquenCer [?] + ENGINE_NVENC0 = 14, // Nvidia Video ENCoder #0 + ENGINE_NVENC1 = 15, // Nvidia Video ENCoder #1 + ENGINE_NVDEC = 16, // Nvidia Video DECoder + + ENGINE_IOCTRL = 18, // I/O ConTRoLler [of NVLINK at least] + ENGINE_LCE = 19, // Logical Copy Engine + ENGINE_GSP = 20, // Gpu System Processor + ENGINE_NVJPG = 21, // NVidia JPeG [Decoder] (Ampere+) +}; +#define ENGINE_TYPES_LEN 22 +static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = { + "Graphics/Compute", + "COPY0", + "COPY1", + "COPY2", + "Unknown Engine ID#4", + "Unknown Engine ID#5", + "Unknown Engine ID#6", + "Unknown Engine ID#7", + "MSPDEC: Picture Decoder", + "MSPPP: Post Processing", + "MSVLD: Variable Length Decoder", + "MSENC: Encoder", + "VIC: Video Image Compositor", + "SEC: Sequencer", + "NVENC0: NVIDIA Video Encoder #0", + "NVENC1: NVIDIA Video Encoder #1", + "NVDEC: NVIDIA Video Decoder", + "Unknown Engine ID#17", + "IOCTRL: I/O Controller", + "LCE: Logical Copy Engine", + "GSP: GPU System Processor", + "NVJPG: NVIDIA JPEG Decoder", +}; + +/* GPU engine information and control register offsets + Each engine is described by one or more entries (terminated by an entry with + the `has_next_entry` flag unset) in the fixed-size PTOP_DEVICE_INFO table. A + typical device, such as the graphics/compute engine and any copy engines, are + described by three entries, one of each type. + + The PTOP_DEVICE_INFO table is sparsely populated (entries of type + INFO_TYPE_NOT_VALID may be intermingled with valid entries), so any traversal + code should check all NV_PTOP_DEVICE_INFO__SIZE_1 entries and not terminate + upon reaching the first entry of INFO_TYPE_NOT_VALID. + + INFO_TYPE : Is this a DATA, ENUM, or ENGINE_TYPE table entry? + HAS_NEXT_ENTRY : Does the following entry refer to the same engine? + + == INFO_TYPE_DATA fields == + PRI_BASE : BAR0 base = (PRI_BASE << 12) aka 4k aligned. + INST_ID : "Note that some instanced [engines] (such as logical copy + engines aka LCE) share a PRI_BASE across all [engines] of + the same engine type; such [engines] require an additional + offset: instanced base = BAR0 base + stride * INST_ID. + FAULT_ID_IS_VALID : Does this engine have its own bind point and fault ID + with the MMU? + FAULT_ID : "The MMU fault id used by this [engine]. These IDs + correspond to the NV_PFAULT_MMU_ENG_ID define list." 
+ + == INFO_TYPE_ENUM fields == + ENGINE_IS_VALID : Is this engine a host engine? + ENGINE_ENUM : "[T]he host engine ID for the current [engine] if it is + a host engine, meaning Host can send methods to the + engine. This id is used to index into any register array + whose __SIZE_1 is equal to NV_HOST_NUM_ENGINES. A given + ENGINE_ENUM can be present for at most one device in the + table. Devices corresponding to all ENGINE_ENUM ids 0 + through NV_HOST_NUM_ENGINES - 1 must be present in the + device info table." + RUNLIST_IS_VALID : Is this engine a host engine with a runlist? + RUNLIST_ENUM : "[T]he Host runlist ID on which methods for the current + [engine] should be submitted... The runlist id is used to + index into any register array whose __SIZE_1 is equal to + NV_HOST_NUM_RUNLISTS. [Engines] corresponding to all + RUNLIST_ENUM ids 0 through NV_HOST_NUM_RUNLISTS - 1 must + be present in the device info table." + INTR_IS_VALID : Does this device have an interrupt? + INTR_ENUM : Interrupt ID for use with "the NV_PMC_INTR_*_DEVICE + register bitfields." + RESET_IS_VALID : Does this engine have a reset ID? + RESET_ENUM : Reset ID for use indexing the "NV_PMC_ENABLE_DEVICE(i) + and NV_PMC_ELPG_ENABLE_DEVICE(i) register bitfields." + + == INFO_TYPE_ENGINE_TYPE fields == + ENGINE_TYPE : What type of engine is this? (see ENGINE_TYPES_NAMES) + + Support: Kepler, Maxwell, Pascal, Volta, Ampere + See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info. +*/ +#define NV_PTOP_DEVICE_INFO(i) (0x00022700+(i)*4) +#define NV_PTOP_DEVICE_INFO__SIZE_1 64 +typedef union { + // DATA type fields + struct { + enum DEVICE_INFO_TYPE info_type:2; + bool fault_id_is_valid:1; + uint32_t fault_id:7; + uint32_t padding0:2; + uint32_t pri_base:12; + uint32_t padding1:2; + uint32_t inst_id:4; + uint32_t is_not_enum2:1; + bool has_next_entry:1; + } __attribute__((packed)); + // ENUM type fields + struct { + uint32_t padding2:2; + bool reset_is_valid:1; + bool intr_is_valid:1; + bool runlist_is_valid:1; + bool engine_is_valid:1; + uint32_t padding3:3; + uint32_t reset_enum:5; + uint32_t padding4:1; + uint32_t intr_enum:5; + uint32_t padding5:1; + uint32_t runlist_enum:4; + uint32_t padding6:1; + uint32_t engine_enum:4; + uint32_t padding7:2; + } __attribute__((packed)); + // ENGINE_TYPE type fields + struct { + uint32_t padding8:2; + enum ENGINE_TYPES engine_type:29; + uint32_t padding9:1; + } __attribute__((packed)); + uint32_t raw; +} ptop_device_info_t; + +#define NV_PTOP_SCAL_NUM_GPCS 0x00022430 +#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434 +#define NV_PTOP_SCAL_NUM_CES 0x00022444 +// PCE_MAP is Volta+ only +#define NV_CE_PCE_MAP 0x00104028 + +// GPC and TPC masks +// Support: Maxwell+ +#define NV_FUSE_GPC 0x00021c1c +#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4) + +/* Location of the 1Kb instance block with page tables for BAR1 and BAR2. 
+ Support: Fermi+ (?), Pascal +*/ +#define NV_PBUS_BAR1_BLOCK 0x00001704 +#define NV_PBUS_BAR2_BLOCK 0x00001714 +typedef union { + struct { + uint32_t ptr:28; + enum INST_TARGET target:2; + uint32_t padding0:1; + bool is_virtual:1; + } __attribute__((packed)); + uint32_t raw; + struct { + uint32_t map:30; + uint32_t padding1:2; + } __attribute__((packed)); +} bar_config_block_t; + +/* BAR0 PRAMIN (Private RAM Instance) window configuration + + BASE : Base of window >> 16 in [TARGET] virtual address space + TARGET : Which address space BASE points into + + Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes + + Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere +*/ +#define NV_PBUS_BAR0_WINDOW 0x00001700 +#define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window) +#define NV_PRAMIN_LEN 0x00100000 +typedef union { + struct { + uint32_t base:24; + enum INST_TARGET target:2; + uint32_t padding0:6; + } __attribute__((packed)); + uint32_t raw; +} bar0_window_t; + +// Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere +#define NV_PRAMIN_PDB_CONFIG_OFF 0x200 +typedef union { + struct { + uint32_t target:2; + uint32_t vol:1; + uint32_t padding0:1; + uint32_t fault_replay_tex:1; + uint32_t fault_replay_gcc:1; + uint32_t padding1:4; + bool is_ver2:1; + bool is_64k_big_page:1; // 128Kb otherwise + uint32_t page_dir_lo:20; + uint32_t page_dir_hi:32; + } __attribute__((packed)); + uint64_t raw; +} page_dir_config_t; + +/* Page directory entry + + Note: Format changed with Pascal (how?) + + Support: Pascal, Volta, Turing, Ampere +*/ +// FIXME: PDE/PTEs are actually 64 bits =S +// Important: Aperture keys are different with PDEs +enum PD_TARGET { + PD_AND_TARGET_INVALID = 0, // b000 + PD_AND_TARGET_VID_MEM = 2, // b010 + PD_AND_TARGET_SYS_MEM_COHERENT = 4, // b100 + PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6, // b110 + PTE_AND_TARGET_VID_MEM = 1, // b001 + PTE_AND_TARGET_PEER = 3, // b011 + PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101 + PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111 +}; +static inline char* pd_target_to_text(enum PD_TARGET t) { + switch (t) { + case PD_AND_TARGET_INVALID: + return "INVALID"; + case PD_AND_TARGET_VID_MEM: + case PTE_AND_TARGET_VID_MEM: + return "VID_MEM"; + case PTE_AND_TARGET_PEER: + return "PEER"; + case PD_AND_TARGET_SYS_MEM_COHERENT: + case PTE_AND_TARGET_SYS_MEM_COHERENT: + return "SYS_MEM_COHERENT"; + case PD_AND_TARGET_SYS_MEM_NONCOHERENT: + case PTE_AND_TARGET_SYS_MEM_NONCOHERENT: + return "SYS_MEM_NONCOHERENT"; + default: + printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); + return NULL; + } +} + +// PDE/PTE V2 type +// Note: As the meaning of target (bits 2:1) changes depending on if the entry +// is a PTE or not, this combines them into a single target field to +// simplify comparisons. 
+// Support: Pascal, Turing, Ampere +typedef union { + // Page Directory Entry (PDE) + struct { + bool is_pte:1; + uint32_t __target:2; + bool is_volatile:1; + uint32_t padding1:4; + uint32_t addr:24; + } __attribute__((packed)); + // Page Table Entry (PTE) + struct { + enum PD_TARGET target:3; + uint32_t __is_volatile:1; + bool is_encrypted:1; + bool is_privileged:1; + bool is_readonly:1; + bool atomics_disabled:1; + uint32_t __addr:24; + } __attribute__((packed)); + uint32_t raw; +} page_dir_entry_t; + +// PDE/PTE V1 types +// Support: Fermi, Kepler, Maxwell +enum V1_PD_TARGET { + PD_TARGET_INVALID = 0, + PD_TARGET_VID_MEM = 1, + PD_TARGET_SYS_MEM_COHERENT = 2, + PD_TARGET_SYS_MEM_NONCOHERENT = 3, +}; +// Page Directory Entry (PDE) +typedef union { +// Large page fields + struct { +// 0:32 + enum V1_PD_TARGET target:2; + uint32_t padding0:2; + uint64_t addr:28; // May be wider? +// 32:63 + uint32_t padding2:3; + uint32_t is_volatile:1; // Might have counted wrong? + uint32_t padding3:28; + } __attribute__((packed)); +// Small page fields + struct { +// 0:32 + uint32_t padding00:32; +// 32:63 + enum V1_PD_TARGET alt_target:2; + uint32_t alt_is_volatile:1; // Might have counted wrong? + uint32_t padding03:1; + uint64_t alt_addr:28; + } __attribute__((packed)); + uint64_t raw; +} page_dir_entry_v1_t; +// Page Table Entry (PTE) +// Reconstructed from info in Jetson nvgpu driver +typedef union { + struct { +// 0:32 + bool is_present:1; + bool is_privileged:1; + bool is_readonly:1; + uint32_t padding0:1; + uint64_t addr:28; +// 32:63 + bool is_volatile:1; + enum INST_TARGET:2; + uint32_t padding1:1; + uint32_t kind:8; + uint32_t comptag:17; + uint32_t padding2:1; + bool is_read_disabled:1; + bool is_write_disabled:1; + } __attribute__((packed)); + uint64_t raw; +} page_tbl_entry_v1_t; +//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3}; +//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3}; +//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024}; +/* PDE V0 (nv50/Tesla) +typedef union { + struct { + enum V1_PDE_TYPE type:2; + enum INST_TARGET target:2; + uint32_t padding0:1; + enum V1_PDE_SIZE sublevel_size:2; + uint32_t padding1:5; + uint32_t addr:28; + uint32_t padding2:24; + } __attribute__((packed)); + uint64_t raw; +} page_dir_entry_v1_t;*/ +/* PTE V0 (nv50) +typedef union { + struct { + bool is_present:1; + uint32_t padding3:2; + bool is_readonly:1; + enum INST_TARGET target:2; + bool is_privileged:1; + uint32_t contig_blk_sz:3; + uint32_t padding4:2; + uint32_t addr:28; + uint32_t storage_type:7; // ??? + uint32_t compression_mode:2; // ??? + uint32_t compression_tag:12; // ??? + bool is_long_partition_cycle:1; // ??? + bool is_encrypted:1; + uint32_t padding5:1; + } __attribute__((packed)); + uint64_t raw; +} page_tbl_entry_v1_t;*/ + // TODO(jbakita): Maybe put the above GPU types in a different file. -#define for_chan_in_tsg(chan, tsg) \ - for (chan = (struct runlist_chan*)(tsg + 1); \ - (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \ - chan++) +#define NV_PCI_VENDOR 0x10de +struct nvdebug_state { + // Pointer to the mapped base address of the GPU control registers (obtained + // via ioremap() originally). For embedded GPUs, we extract this from their + // struct nvgpu_os_linux. For discrete GPUs, we create our own mapping of + // BAR0 with pci_iomap(). Access via nvgpu_readl/writel functions. 
+	void __iomem *regs;
+	// Depending on the architecture, BAR2 or BAR3 is used to access PRAMIN
+	union {
+		void __iomem *bar2;
+		void __iomem *bar3;
+	};
+	int chip_id;
+	// Additional state from the built-in driver. Only set iff
+	// chip_id == NV_CHIP_ID_GV11B
+	struct gk20a *g;
+	// Pointer to PCI device needed for pci_iounmap
+	struct pci_dev *pcid;
+};
+
+/*const struct runlist_funcs {
+	u8 size;
+	enum ENTRY_TYPE (*entry_type)(struct nvdebug_state *, void *);
+	uint32_t (*chid)(struct nvdebug_state *, void *);
+	uint32_t (*inst_ptr_lo)(struct nvdebug_state *, void *);
+	enum INST_TARGET (*inst_target)(struct nvdebug_state *, void *);
+	uint32_t (*tsgid)(struct nvdebug_state *, void *);
+	uint32_t (*timeslice_scale)(struct nvdebug_state *, void *);
+	uint32_t (*timeslice_timeout)(struct nvdebug_state *, void *);
+	uint32_t (*tsg_length)(struct nvdebug_state *, void *);
+};*/
+
+// This disgusting macro is a crutch to work around the fact that runlists were
+// different prior to Volta. (The comparisons are >=, matching NV_RL_ENTRY_SIZE
+// below, so that the first chip of each architecture selects its own format.)
+#define VERSIONED_RL_ACCESSOR(_ENTRY_TYPE, type, prop) \
+	__attribute__((unused)) \
+	static type (prop)(const struct nvdebug_state *g, const void *raw) { \
+		if (g->chip_id >= NV_CHIP_ID_VOLTA) { \
+			const struct gv100_runlist_ ## _ENTRY_TYPE *entry = (struct gv100_runlist_ ## _ENTRY_TYPE*)raw; \
+			return entry->prop; \
+		} else if (g->chip_id >= NV_CHIP_ID_KEPLER) { \
+			const struct gk110_runlist_ ## _ENTRY_TYPE *entry = (struct gk110_runlist_ ## _ENTRY_TYPE*)raw; \
+			return entry->prop; \
+		} else { \
+			printk(KERN_WARNING "[nvdebug] " #prop " unavailable on GPU ID %x, which is older than Kepler.\n", g->chip_id); \
+			return (type)0; \
+		} \
+	}
+
+VERSIONED_RL_ACCESSOR(chan, uint32_t, chid);
+VERSIONED_RL_ACCESSOR(chan, uint32_t, inst_ptr_lo);
+VERSIONED_RL_ACCESSOR(chan, enum INST_TARGET, inst_target);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsgid);
+VERSIONED_RL_ACCESSOR(tsg, enum ENTRY_TYPE, entry_type);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_scale);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_timeout);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
 
-#define next_tsg(tsg) \
-	(void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length
+
+#define NV_RL_ENTRY_SIZE(g) \
+	((g)->chip_id >= NV_CHIP_ID_VOLTA ? sizeof(struct gv100_runlist_tsg) : sizeof(struct gk110_runlist_tsg))
+
+#define for_chan_in_tsg(g, chan, tsg) \
+	for (chan = (typeof(chan))(((u8*)tsg) + NV_RL_ENTRY_SIZE(g)); \
+	     (u8*)chan < ((u8*)tsg) + (1 + tsg_length(g, tsg)) * NV_RL_ENTRY_SIZE(g); \
+	     chan = (typeof(chan))(((u8*)chan) + NV_RL_ENTRY_SIZE(g)))
+
+#define next_tsg(g, tsg) \
+	(typeof(tsg))((u8*)(tsg) + NV_RL_ENTRY_SIZE(g) * (tsg_length(g, tsg) + 1))
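+
+// Iteration sketch (hypothetical `rl` buffer of `len` entries; `rl` must be
+// CPU-dereferenceable): the versioned accessors and macros above compose like
+// so to walk a runlist:
+//
+//   void *entry = rl;
+//   while (entry < rl + len * NV_RL_ENTRY_SIZE(g)) {
+//       if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
+//           void *chan;
+//           printk(KERN_INFO "TSG %d\n", tsgid(g, entry));
+//           for_chan_in_tsg(g, chan, entry)
+//               printk(KERN_INFO "  Channel %d\n", chid(g, chan));
+//           entry = next_tsg(g, entry);
+//       } else { // Bare channel (pre-Volta only)
+//           entry = (u8*)entry + NV_RL_ENTRY_SIZE(g);
+//       }
+//   }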
 struct runlist_iter {
-	struct entry_tsg *curr_tsg;
+	// Pointer to either a TSG or channel entry (they're the same size)
+	void *curr_entry;
+	// This should be set to tsg_length when a TSG is reached, and
+	// decremented as each subsequent channel is printed. This allows us to
+	// track which channels are and are not part of the TSG.
+	int channels_left_in_tsg;
+	// Total runlist length, etc
 	runlist_info_t rl_info;
 };
 
+#define NVDEBUG_MAX_DEVICES 8
+extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
+
 // Defined in runlist.c
-struct gk20a* get_live_gk20a(void);
-int get_runlist_iter(struct runlist_iter *rl_iter);
-int preempt_tsg(uint32_t tsg_id);
+int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
+int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
+
+// Defined in mmu.c
+uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
+void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
+uint64_t search_page_directory(
+		struct nvdebug_state *g,
+		void __iomem *pde_offset,
+		void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+		uint64_t addr_to_find);
+uint64_t search_v1_page_directory(
+		struct nvdebug_state *g,
+		void __iomem *pde_offset,
+		void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+		uint64_t addr_to_find);
+
 
 static inline struct gk20a *get_gk20a(struct device *dev) {
 	// XXX: Only works because gk20a* is the first member of gk20a_platform
 	return *((struct gk20a**)dev_get_drvdata(dev));
 }
 
-// Functionally identical to nvgpu_readl()
+// We use the data field of the proc_dir_entry ("PDE" here) to store our
+// index into the g_nvdebug_state array
+static inline int seq2gpuidx(struct seq_file *s) {
+	const struct file *f = s->file;
+	return (uintptr_t)PDE_DATA(file_inode(f));
+}
+static inline int file2gpuidx(const struct file *f) {
+	return (uintptr_t)PDE_DATA(file_inode(f));
+}
+static inline int file2parentgpuidx(const struct file *f) {
+	// Should be safe to call on ProcFS entries, as our parent should (?)
+	// still exist if we're called. If not, there are worse races in this
+	// module.
+	return (uintptr_t)PDE_DATA(file_dentry(f)->d_parent->d_inode);
+}
+
+#define gk20a_regs(gk20a) (container_of(gk20a, struct nvgpu_os_linux, g)->regs)
+
+// Similar to nvgpu_readl()
 // (except we don't try to resolve situations where regs is NULL)
-static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	if (unlikely(!g_os->regs)) {
-		printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-		return -1;
-	}
-	return readl(g_os->regs + r);
+static inline u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readl on non-existent registers!\n");
+		return -1;
+	}
+	return readl(s->regs + r);
 }
 
 // quadword version of nvdebug_readl()
-static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	u64 ret;
-	if (unlikely(!g_os->regs)) {
-		printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-		return -1;
-	}
+static inline u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
+	u64 ret;
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readq on non-existent registers!\n");
+		return -1;
+	}
 	// readq seems to always return the uppermost 32 bits as 0, so workaround with readl
-	ret = readl(g_os->regs + r);
-	ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
+	ret = readl(s->regs + r);
+	ret |= ((u64)readl(s->regs + r + 4)) << 32;
 	return ret;
 }
 
-// Functionally identical to nvgpu_writel()
-static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	if (unlikely(!g_os->regs)) {
+// Similar to nvgpu_writel()
+static inline void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_writel on non-existent registers!\n");
+		return;
+	}
+	writel_relaxed(v, s->regs + r);
+	wmb();
+}
+
+// quadword version of nvdebug_writel()
+// XXX: This probably doesn't work XXX: Untested
+static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
 		printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
 		return;
 	}
-	writel_relaxed(v, g_os->regs + r);
+	writeq_relaxed(v, s->regs + r);
 	wmb();
 }
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 0854b8b..695b5fd 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -2,64 +2,282 @@
  * SPDX-License-Identifier: MIT
  */
 
-/* TODO
- *  - Add sysfs trigger for a preemption
- */
-
 #include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
+#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/proc_fs.h> // So we can set up entries in /proc
+#include <linux/pci.h> // For PCI device scanning
+#include <linux/proc_fs.h> // So we can set up entries in /proc
 
 #include "nvdebug.h"
+#include "stubs.h"
 
-// LIAR. But without this we can't use GPL-only exported symbols like
+// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
 // platform_bus_type or bus_find_device_by_name...
-MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual MIT/GPL"); MODULE_AUTHOR("Joshua Bakita"); MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs"); -MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now extern const struct file_operations runlist_file_ops; extern const struct file_operations preempt_tsg_file_ops; extern const struct file_operations disable_channel_file_ops; extern const struct file_operations enable_channel_file_ops; extern const struct file_operations switch_to_tsg_file_ops; +extern const struct file_operations device_info_file_ops; +extern const struct file_operations nvdebug_read_reg32_file_ops; + +// Bus types are global symbols in the kernel +extern struct bus_type platform_bus_type; +struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; +unsigned int g_nvdebug_devices = 0; + +// TEMP +irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { + printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num); + return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. +} + +// Find any and all NVIDIA GPUs in the system +// Note: This function fails if any of them are in a bad state +int probe_and_cache_device(void) { + // platform bus (SoC) iterators + struct device *dev = NULL; + struct device *temp_dev; + // PCI search iterator and search query + struct pci_dev *pcid = NULL; + // This query pattern is mirrored off nouveau + struct pci_device_id query = { + .vendor = NV_PCI_VENDOR, // Match NVIDIA devices + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .class_mask = 0xff << 16, + .class = PCI_BASE_CLASS_DISPLAY << 16, // Match display devs + }; + int i = 0; + // Search the platform bus for the first device that matches our name + // Search for GV10B (Jetson Xavier) + while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) + dev = temp_dev; + // Search for GP10B (Jetson TX2) + while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b"))) + dev = temp_dev; + // TODO: Support other platform bus devices (gk20a, gm20b) + if (dev) { + struct nvgpu_os_linux *l; + mc_boot_0_t ids; + g_nvdebug_state[i].g = get_gk20a(dev); + l = container_of(g_nvdebug_state[i].g, struct nvgpu_os_linux, g); + g_nvdebug_state[i].regs = l->regs; + if (!g_nvdebug_state[i].regs) + return -EADDRNOTAVAIL; + ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); + if (ids.raw == -1) + return -EADDRNOTAVAIL; + g_nvdebug_state[i].chip_id = ids.chip_id; + printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.", + ids.chip_id, ARCH2NAME(ids.architecture)); + i++; + } + // Search the PCI bus and iterate through all matches + // FIXME: State rollback + while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) { + mc_boot_0_t ids; + g_nvdebug_state[i].g = NULL; + // Map BAR0 (GPU control registers) + g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); + if (!g_nvdebug_state[i].regs) { + pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); + return -EADDRNOTAVAIL; + } + // Map BAR3 (CPU-accessible mappings of GPU DRAM) + g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0); + // Try mapping only the lower half of BAR3 on fail + // (vesafb may map the top half for display) + if (!g_nvdebug_state[i].bar3) + g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); + g_nvdebug_state[i].pcid = pcid; + ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); + if 
(ids.raw == -1) { + pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n"); + return -EADDRNOTAVAIL; + } + g_nvdebug_state[i].chip_id = ids.chip_id; + printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", + ids.chip_id, ARCH2NAME(ids.architecture)); + // TEMP + if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) { + printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n"); + } + i++; + } + // Return the number of devices we found + if (i > 0) + return i; + return -ENODEV; +} + +// Create files `/proc/gpu#/runlist#`, world readable +int create_runlist_files(int device_id, struct proc_dir_entry *dir) { + ptop_device_info_t info; + struct proc_dir_entry *rl_entry; + int i, rl_id; + char runlist_name[12]; + int max_rl_id = 0; // Always at least one runlist + // Figure out how many runlists there are by checking the device info + // registers. Runlists are always numbered sequentially, so we just have + // to find the highest-valued one and add 1 to get the number of runlists. + for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) { + info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO(i)); + if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) + continue; + if (info.runlist_enum > max_rl_id) + max_rl_id = info.runlist_enum; + } + // Create files to read each runlist. The read handling code looks at the + // PDE_DATA associated with the file to determine what the runlist ID is. + for (rl_id = 0; rl_id <= max_rl_id; rl_id++) { + snprintf(runlist_name, 12, "runlist%d", rl_id); + rl_entry = proc_create_data( + runlist_name, 0444, dir, &runlist_file_ops, + (void*)(uintptr_t)rl_id); + if (!rl_entry) + return -ENOMEM; + } + return 0; +} + +// Create files /proc/gpu# +// TODO: Don't run this on unsupported GPUs +int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) { + char file_name[20]; + int i; + struct proc_dir_entry *gpc_tpc_mask_entry; + // Get a bitmask of which GPCs are disabled + uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC); + // Get maximum number of enabled GPCs for this chip + uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS); + // For each enabled GPC, expose a mask of disabled TPCs + for (i = 0; i < max_gpcs; i++) { + // Do nothing if GPC is disabled + if ((1 << i) & gpcs_mask) + continue; + // If GPC is enabled, create an entry to read disabled TPCs mask + snprintf(file_name, 20, "gpc%d_tpc_mask", i); + gpc_tpc_mask_entry = proc_create_data( + file_name, 0444, dir, &nvdebug_read_reg32_file_ops, + (void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i)); + if (!gpc_tpc_mask_entry) + return -ENOMEM; + } + return 0; +} int __init nvdebug_init(void) { - struct proc_dir_entry *rl_entry, *preempt_entry, *disable_channel_entry, - *enable_channel_entry, *switch_to_tsg_entry; - // Create file `/proc/preempt_tsg`, world readable - rl_entry = proc_create("runlist", 0444, NULL, &runlist_file_ops); - // Create file `/proc/preempt_tsg`, world writable - preempt_entry = proc_create("preempt_tsg", 0222, NULL, &preempt_tsg_file_ops); - // Create file `/proc/disable_channel`, world writable - disable_channel_entry = proc_create("disable_channel", 0222, NULL, &disable_channel_file_ops); - // Create file `/proc/enable_channel`, world writable - enable_channel_entry = proc_create("enable_channel", 0222, NULL, &enable_channel_file_ops); - // Create file `/proc/switch_to_tsg`, world writable - 
switch_to_tsg_entry = proc_create("switch_to_tsg", 0222, NULL, &switch_to_tsg_file_ops);
-	// ProcFS entry creation only fails if out of memory
-	if (!rl_entry || !preempt_entry || !disable_channel_entry || !enable_channel_entry || !switch_to_tsg_entry) {
-		remove_proc_entry("runlist", NULL);
-		remove_proc_entry("preempt_tsg", NULL);
-		remove_proc_entry("disable_channel", NULL);
-		remove_proc_entry("enable_channel", NULL);
-		remove_proc_entry("switch_to_tsg", NULL);
-		printk(KERN_ERR "[nvdebug] Unable to initialize procfs entries!\n");
-		return -ENOMEM;
+	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
+		*enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
+		*num_gpcs_entry;
+	int rl_create_err, tpc_masks_create_err;
+	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
+	int res = probe_and_cache_device();
+	if (res < 0)
+		return res;
+	g_nvdebug_devices = res;
+	// Create separate ProcFS directories for each gpu
+	while (res--) {
+		char device_id_str[7];
+		uintptr_t device_id = res; // This is a uintptr_t as we abuse the *data field on proc_dir_entry to store the GPU id
+		// Create directory /proc/gpu# where # is the GPU number
+		snprintf(device_id_str, 7, "gpu%lu", device_id);
+		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
+			goto out_nomem;
+		// Create files `/proc/gpu#/runlist#`, world readable
+		rl_create_err = create_runlist_files(device_id, dir);
+		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
+		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
+		// Create file `/proc/gpu#/preempt_tsg`, world writable
+		preempt_entry = proc_create_data(
+			"preempt_tsg", 0222, dir, &preempt_tsg_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/disable_channel`, world writable
+		disable_channel_entry = proc_create_data(
+			"disable_channel", 0222, dir, &disable_channel_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/enable_channel`, world writable
+		enable_channel_entry = proc_create_data(
+			"enable_channel", 0222, dir, &enable_channel_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/switch_to_tsg`, world writable
+		switch_to_tsg_entry = proc_create_data(
+			"switch_to_tsg", 0222, dir, &switch_to_tsg_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/device_info`, world readable
+		device_info_entry = proc_create_data(
+			"device_info", 0444, dir, &device_info_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/num_gpcs`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_gpcs", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_GPCS);
+		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_tpc_per_gpc", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
+		// Create file `/proc/gpu#/num_ces`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_ces", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_CES);
+		// Create file `/proc/gpu#/gpc_mask`, world readable
+		num_gpcs_entry = proc_create_data(
+			"gpc_mask", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_FUSE_GPC);
+		// In both nouveau and nvgpu, the PCE_MAP register is only available on Volta+
+		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
+			// TODO: Redo to num_pces
+			// Create file `/proc/gpu#/pce_map`, world readable
+			num_gpcs_entry = proc_create_data(
+				"pce_map", 0444, dir, &nvdebug_read_reg32_file_ops,
+				(void*)NV_CE_PCE_MAP);
+		}
+		// ProcFS entry creation only fails if out of memory
+		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
+		    !disable_channel_entry || !enable_channel_entry ||
+		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
+			goto out_nomem;
 	}
+	// (See Makefile if you want to know the origin of GIT_HASH.)
 	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
 	return 0;
+out_nomem:
+	// Make sure to clear all ProcFS directories on error
+	while (res < g_nvdebug_devices) {
+		char device_id_str[7];
+		snprintf(device_id_str, 7, "gpu%d", res);
+		remove_proc_subtree(device_id_str, NULL);
+		res++;
+	}
+	return -ENOMEM;
 }
 
 static void __exit nvdebug_exit(void) {
-	remove_proc_entry("runlist", NULL);
-	remove_proc_entry("preempt_tsg", NULL);
-	remove_proc_entry("disable_channel", NULL);
-	remove_proc_entry("enable_channel", NULL);
-	remove_proc_entry("switch_to_tsg", NULL);
-	printk(KERN_INFO "[nvdebug] Exiting...\n");
+	struct nvdebug_state *g;
+	// Deinitialize each device
+	while (g_nvdebug_devices--) {
+		// Remove procfs directory
+		char device_id[7];
+		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
+		remove_proc_subtree(device_id, NULL);
+		g = &g_nvdebug_state[g_nvdebug_devices];
+		// Free BAR mappings and the IRQ tap for PCI devices only (on Tegra,
+		// `regs` is borrowed from the nvgpu driver and must not be unmapped,
+		// and no IRQ tap was installed)
+		if (g->pcid) {
+			if (g->regs)
+				pci_iounmap(g->pcid, g->regs);
+			if (g->bar2)
+				pci_iounmap(g->pcid, g->bar2);
+			// TEMP
+			free_irq(g->pcid->irq, g->pcid);
+		}
+		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
+	}
+	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
 }
 
 module_init(nvdebug_init);
diff --git a/runlist.c b/runlist.c
index c8ff99f..94be18e 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,122 +1,127 @@
-#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
-//#include <linux/iommu.h> // For struct iommu_domain
 #include <linux/types.h> // Kernel types
-#include
 
 #include "nvdebug.h"
 
-// Bus types are global symbols in the kernel
-extern struct bus_type platform_bus_type;
-
-struct gk20a* get_live_gk20a(void) {
-	struct device *dev = NULL;
-	struct device *temp_dev;
-	struct gk20a *g;
-	struct nvgpu_os_linux *l;
-	// Get the last device that matches our name
-	while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
-		dev = temp_dev;
-		printk(KERN_INFO "[nvdebug] Found a matching device %s\n", dev_name(dev));
-	}
-	if (!dev)
-		return NULL;
-	g = get_gk20a(dev);
-	// The address pointed to `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
-	// - A GPU address (type is sysmem_coherent)
-	// - A physical address (dereferencing after ioremap crashes)
-	// - A kernel virtual address (dereferencing segfaults)
-	// So maybe it's some sort of custom thing? This is an address that the GPU
-	// can use, so it would make most sense for it to be a physical address.
-	//
-	// BUT, it can't possibly be a physical address, as it would refer to an
-	// address greater than the maximum one on our system (by a lot!).
-	// Maybe I'm reading the runlist base wrong?
-	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
-	// address! So, what's this I/O address space? All I know is that it's what
-	// nvgpu_mem_get_addr() returns. That function returns the result of either:
-	// - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
-	//   converts an IPA to a PA?
-	// - nvgpu_mem_iommu_translate
-	//
-	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
-	// returns SYSMEM.
-	//
-	// To convert a physical address to a IOMMU address, we add a bit
-	//
-	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS!
-	// before because the GPU had simply gone to sleep and invalidated its
-	// register state, so nvgpu_readl() was simply returning garbage.
-	l = container_of(g, struct nvgpu_os_linux, g);
-	if (!l->regs)
-		return NULL;
-	return g;
-}
-
 /* Get runlist head and info (incl. length)
    @param rl_iter Location at which to store output
+   @param rl_id   ID of the runlist to read
 */
-int get_runlist_iter(struct runlist_iter *rl_iter) {
-	struct entry_tsg head;
-	runlist_base_t rl_base;
-	runlist_info_t rl_info;
-	u64 runlist_iova;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
+int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter) {
+	runlist_base_t rl_base;
+	runlist_info_t rl_info;
+	u64 runlist_iova;
+	*rl_iter = (struct runlist_iter){0};
+	rl_base.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST_BASE(rl_id));
+	// Check that reads are working
+	if (rl_base.raw == -1)
 		return -EIO;
-	rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
-	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
-	runlist_iova = ((u64)rl_base.ptr) << 12;
-	printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n",
-		rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova);
-	// TODO: Support reading video memory
-	if (rl_base.type == TARGET_VID_MEM) {
-		printk(KERN_ERR "[nvdebug] Runlist is located in video memory. Access to video memory is unimplemented.");
-		return -ENOTSUPP;
+	// The address pointed to by `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
+	// - A GPU address (type is sysmem_coherent)
+	// - A physical address (dereferencing after ioremap crashes)
+	// - A kernel virtual address (dereferencing segfaults)
+	// So maybe it's some sort of custom thing? This is an address that the GPU
+	// can use, so it would make most sense for it to be a physical address.
+	//
+	// BUT, it can't possibly be a physical address, as it would refer to an
+	// address greater than the maximum one on our system (by a lot!).
+	// Maybe I'm reading the runlist base wrong?
+	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
+	// address! So, what's this I/O address space? All I know is that it's what
+	// nvgpu_mem_get_addr() returns. That function returns the result of either:
+	// - gpu_phys_addr, which is __nvgpu_sgl_phys on our platform, which (?)
+	//   converts an IPA to a PA?
+	// - nvgpu_mem_iommu_translate
+	//
+	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
+	// returns SYSMEM.
+	//
+	// To convert a physical address to an IOMMU address, we add a bit
+	//
+	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
+	// before because the GPU had simply gone to sleep and invalidated its
+	// register state, so nvgpu_readl() was simply returning garbage.
+	rl_info.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST(rl_id));
+	runlist_iova = ((u64)rl_base.ptr) << 12;
+	printk(KERN_INFO "[nvdebug] Runlist %d @ %llx in %s (config raw: %x)\n",
+		rl_id, runlist_iova, target_to_text(rl_base.target), rl_base.raw);
+	printk(KERN_INFO "[nvdebug] Runlist length %d, ID %d\n", rl_info.len, rl_info.id);
+	// Return early on an empty runlist
+	if (!rl_info.len)
+		return 0;
+	// If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping
+	if (rl_base.target == TARGET_VID_MEM) {
+		printk(KERN_WARNING "[nvdebug] Runlist is located in video memory. Access to video memory is experimental.");
+		bar_config_block_t bar1_block, bar2_block;
+		bar1_block.raw = nvdebug_readl(g, NV_PBUS_BAR1_BLOCK);
+		printk(KERN_INFO "[nvdebug] BAR1 inst block @ %llx in %s's %s address space.\n", ((u64)bar1_block.ptr) << 12, target_to_text(bar1_block.target), bar1_block.is_virtual ? "virtual" : "physical");
+		bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK);
+		printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", ((u64)bar2_block.ptr) << 12, target_to_text(bar2_block.target), bar2_block.is_virtual ? "virtual" : "physical");
+		uint32_t bar_inst_pramin_offset = vram2PRAMIN(g, (uint64_t)bar2_block.ptr << 12);
+		if (!bar_inst_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}
+		/* TODO: Support BAR1?
+		bar_inst_pramin_offset = vram2PRAMIN(g, bar1_block.ptr << 12);
+		if (!bar_inst_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR1 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}*/
+		// Instance blocks (size == 1 KiB) contain many things, but we only care about
+		// the section which describes the location of the page directory (page table)
+		uint32_t bar_pdb_config_pramin_offset = bar_inst_pramin_offset + NV_PRAMIN_PDB_CONFIG_OFF;
+		page_dir_config_t pd_config;
+		pd_config.raw = nvdebug_readq(g, bar_pdb_config_pramin_offset + NV_PRAMIN);
+		uint64_t bar_pdb_vram_addr = pd_config.page_dir_hi;
+		bar_pdb_vram_addr <<= 20;
+		bar_pdb_vram_addr |= pd_config.page_dir_lo;
+		bar_pdb_vram_addr <<= 12;
+		printk(KERN_INFO "[nvdebug] BAR2 PDB @ %llx in %s of version %s (config raw: %llx)\n", bar_pdb_vram_addr, target_to_text(pd_config.target), pd_config.is_ver2 ? "2" : "1", pd_config.raw);
+		// TODO: SYSMEM support for page table location
+		if (pd_config.target != TARGET_VID_MEM) {
+			printk(KERN_WARNING "[nvdebug] BAR2 PDB is in an unsupported location.\n");
+			return -EOPNOTSUPP;
+		}
+		uint32_t bar_pdb_pramin_offset = vram2PRAMIN(g, bar_pdb_vram_addr);
+		if (!bar_pdb_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find the page directory for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}
+		uint64_t runlist_bar_vaddr;
+		if (pd_config.is_ver2)
+			runlist_bar_vaddr = search_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+		else
+			runlist_bar_vaddr = search_v1_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+		if (!runlist_bar_vaddr) {
+			printk(KERN_WARNING "[nvdebug] Unable to find runlist mapping in BAR2/3 page tables.\n");
+			return -EOPNOTSUPP;
+		}
+		printk(KERN_INFO "[nvdebug] Runlist @ %llx in BAR2 virtual address space.\n", runlist_bar_vaddr);
+		/* XXX: Old test code
+		uint32_t bar2_pd_pramin_offset = vram_to_pramin_off(bar2_pd);
+		//walk_pd_subtree(bar2_pd_pramin_offset);
+		uint64_t runlist_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, runlist_iova);
+		page_dir_entry_t pde_0;
+		pde_0.raw = nvdebug_readl(g, NV_PRAMIN + bar2_pd_pramin_offset);
+		uint32_t pde_1 = nvdebug_readl(g, NV_PRAMIN + vram_to_pramin_off(((u64)pde_0.addr) << 12));
+		uint64_t pde_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, ((u64)pde_0.addr) << 12);
+		uint32_t pde_2 = readl(g->bar3 + pde_bar2_vaddr);
+		printk(KERN_INFO "[nvdebug] PDE0 via PRAMIN: %x, via BAR3: %x\n", pde_1, pde_2);
+		*/
+		if (!g->bar2) {
+			printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped.\n");
+			return -ENODEV;
+		}
+		rl_iter->curr_entry = g->bar2 + runlist_bar_vaddr;
+	} else {
+		// Directly access the runlist if stored in SYS_MEM (physically addressed)
+		rl_iter->curr_entry = phys_to_virt(runlist_iova);
 	}
-	// Segfaults
-	//u32 attempted_read = ioread32(runlist_iova);
-	//printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);
-
-	// Errors out
-	//u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
-	//printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);
-
-	/* Overcomplicated?
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-	if (!domain) {
-		printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
-		return -EIO;
-	}
-	u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
-	printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
-	*/
-
-	printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
-	printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
-	head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
-
-	rl_iter->curr_tsg = (struct entry_tsg*)phys_to_virt(runlist_iova);
-	rl_iter->rl_info = rl_info;
-	return 0;
-	//printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
-	//printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
-	//printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
-	//printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
-	//printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
-
-	//printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
-	//printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
-	//printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
-	//read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
-	/*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
-	printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
-	printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
-	printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
-	printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid); */
+	rl_iter->rl_info = rl_info;
+	return 0;
 }
 
-int preempt_tsg(uint32_t tsg_id) {
-	struct gk20a *g = get_live_gk20a();
+int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id) {
 	runlist_info_t rl_info;
 	pfifo_preempt_t pfifo_preempt;
 	runlist_disable_t rl_disable;
diff --git a/runlist_procfs.c b/runlist_procfs.c
index 411f844..a6b0d94 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -6,7 +6,14 @@
 #define RUNLIST_PROCFS_NAME "runlist"
 
 #define DETAILED_CHANNEL_INFO
-static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uint32_t chid) {
+/* Print channel details using PCCSR (Programmable Channel Control System RAM?)
+ * @param s Pointer to state from seq_file subsystem to pass to seq_printf
+ * @param g Pointer to our internal GPU state
+ * @param chid ID of channel to print details on, range [0, 512)
+ * @param prefix Text string to prefix each line with, or empty string
+ */
+#ifdef DETAILED_CHANNEL_INFO
+static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) {
 	channel_ctrl_t chan;
 	char *loc_txt;
 	u64 instance_ptr;
@@ -16,23 +23,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uin
 		return -EIO;
 	instance_ptr = chan.inst_ptr;
 	instance_ptr <<= 12;
-	seq_printf(s, " +- Channel Info %-4d -+\n", chid);
-	seq_printf(s, " | Enabled: %d|\n", chan.enable);
-	seq_printf(s, " | Next: %d|\n", chan.next);
-	seq_printf(s, " | Force CTX Reload: %d|\n", chan.force_ctx_reload);
-	seq_printf(s, " | Enable set: %d|\n", chan.enable_set);
-	seq_printf(s, " | Enable clear: %d|\n", chan.enable_clear);
-	seq_printf(s, " | PBDMA Faulted: %d|\n", chan.pbdma_faulted);
-	seq_printf(s, " | ENG Faulted: %d|\n", chan.eng_faulted);
-	seq_printf(s, " | Status: %2d|\n", chan.status);
-	seq_printf(s, " | Busy: %d|\n", chan.busy);
-	seq_printf(s, " | Instance PTR: |\n");
-	seq_printf(s, " | %#018llx |\n", instance_ptr);
-	seq_printf(s, " | %-20s|\n", loc_txt);
-	seq_printf(s, " | Instance bound: %d|\n", chan.inst_bind);
-	seq_printf(s, " +---------------------+\n");
+	seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid);
+	seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable);
+	seq_printf(s, "%s| Next: %d|\n", prefix, chan.next);
+	seq_printf(s, "%s| Force CTX Reload: %d|\n", prefix, chan.force_ctx_reload);
+	seq_printf(s, "%s| Enable set: %d|\n", prefix, chan.enable_set);
+	seq_printf(s, "%s| Enable clear: %d|\n", prefix, chan.enable_clear);
+	seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted);
+	seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted);
+	seq_printf(s, "%s| Status: %2d|\n", prefix, chan.status);
+	seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy);
+	seq_printf(s, "%s| Instance PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx |\n", prefix, instance_ptr);
+	seq_printf(s, "%s| %-20s|\n", prefix, loc_txt);
+	seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
+	// START TEMP
+	// "runlist_id -1 is synonym for the ENGINE_GR_GK20A runlist id"
+	// GR, GRCE, and ASYNC_CE
+	// Note that this appears to be broken??
+	// Peek into the channel instance RAM
+	if (chan.inst_target == TARGET_SYS_MEM_COHERENT) {
+		seq_printf(s, "%s| Target Engine: %2d|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*43/*NV_RAMFC_TARGET*/) & 0x1f);
+		seq_printf(s, "%s| PDB LO: %#08x|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*128/*NV_RAMIN_PAGE_DIR_BASE_LO*/) & 0xfffff000);
+		seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(*(uint64_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*166/*NV_RAMIN_SC_PDB_VALID*/)));
+		// This appears to be unset on Xavier
+		//seq_printf(s, "%s| PAS ID: %8ld|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*135/*NV_RAMIN_PASID*/) & 0xfffff);
+	}
+	// END TEMP
+	seq_printf(s, "%s+---------------------+\n", prefix);
 	return 0;
 }
+#endif
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
 // Bug workaround. See comment in runlist_file_seq_start()
@@ -41,10 +62,14 @@ static loff_t pos_fixup;
 
 static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
 	static struct runlist_iter rl_iter;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
 	// *pos == 0 for first call after read of file
 	if (*pos == 0) {
-		int err = get_runlist_iter(&rl_iter);
+		int err = get_runlist_iter(g, seq2gpuidx(s), &rl_iter);
 		if (err)
+			return ERR_PTR(err);
+		// Don't try to print an empty runlist
+		if (rl_iter.rl_info.len <= 0)
 			return NULL;
 		return &rl_iter;
 	}
@@ -68,12 +93,13 @@ static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter,
 	loff_t *pos) {
 	struct runlist_iter* rl_iter = raw_rl_iter;
 	void *ret = NULL;
-	// Advance by one TSG + channels under last TSG
-	*pos += 1 + rl_iter->curr_tsg->tsg_length;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
+	// Advance by one TSG or channel
+	(*pos)++;
+	rl_iter->curr_entry += NV_RL_ENTRY_SIZE(g);
 	// Verify we haven't reached the end of the runlist
 	// rl_info.len is the num of tsg entries + total num of channel entries
 	if (*pos < rl_iter->rl_info.len) {
-		rl_iter->curr_tsg = next_tsg(rl_iter->curr_tsg);
 		ret = rl_iter;
 	}
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
@@ -88,57 +114,57 @@ static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) {
 }
 
 static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
-	struct entry_tsg* tsg = ((struct runlist_iter*)raw_rl_iter)->curr_tsg;
-	struct runlist_chan* chan;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
-	if (tsg->entry_type != ENTRY_TYPE_TSG) {
-		printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in tsg print logic!\n");
-		return -EIO;
-	}
-	seq_printf(s, "+---- TSG Entry %-2d----+\n", tsg->tsgid);
-	seq_printf(s, "| Scale: %-13d|\n", tsg->timeslice_scale);
-	seq_printf(s, "| Timeout: %-11d|\n", tsg->timeslice_timeout);
-	seq_printf(s, "+---------------------+\n");
-	for_chan_in_tsg(chan, tsg) {
+	struct runlist_iter *rl_iter = raw_rl_iter;
+	void *entry = rl_iter->curr_entry;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
+	if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
+		if (rl_iter->channels_left_in_tsg) {
+			printk(KERN_WARNING "[nvdebug] Found a TSG @ %px when %d channels were still expected under the previous TSG in the runlist!\n", entry, rl_iter->channels_left_in_tsg);
+			return -EIO;
+		}
+		rl_iter->channels_left_in_tsg = tsg_length(g, entry);
+		seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry));
+		seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry));
+		seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry));
+		seq_printf(s, "| Length: %-12d|\n", tsg_length(g, entry));
+		seq_printf(s, "+---------------------+\n");
+	} else {
+		char *indt = "";
 #ifndef DETAILED_CHANNEL_INFO
-		char* loc_txt;
-		u64 instance_ptr;
+		u64 instance_ptr = 0;
 #endif
-		if (chan->entry_type != ENTRY_TYPE_CHAN) {
-			printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in channel print logic!\n");
-			return -EIO;
+		if (rl_iter->channels_left_in_tsg) {
+			indt = " ";
+			rl_iter->channels_left_in_tsg--;
 		}
 #ifdef DETAILED_CHANNEL_INFO
-		runlist_detail_seq_show_chan(s, g, chan->chid);
+		runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
 #else
-		loc_txt = target_to_text(chan->inst_target);
-		if (!loc_txt) {
-			printk(KERN_WARNING "[nvdebug] Invalid apature in channel print logic!\n");
-			return -EIO;
-		}
 		// Reconstruct pointer to channel instance block
-		instance_ptr = chan->inst_ptr_hi;
-		instance_ptr <<= 32;
-		instance_ptr |= chan->inst_ptr_lo << 12;
-
-		seq_printf(s, " +- Channel Entry %-4d-+\n", chan->chid);
-		seq_printf(s, " | Runqueue Selector: %d|\n", chan->runqueue_selector);
-		seq_printf(s, " | Instance PTR: |\n");
-		seq_printf(s, " | %#018llx |\n", instance_ptr);
-		seq_printf(s, " | %-20s|\n", loc_txt);
-		seq_printf(s, " +---------------------+\n");
+		if (g->chip_id >= NV_CHIP_ID_VOLTA) {
+			instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi;
+			instance_ptr <<= 32;
+		}
+		instance_ptr |= inst_ptr_lo(g, entry) << 12;
+
+		seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry));
+		if (g->chip_id >= NV_CHIP_ID_VOLTA)
+			seq_printf(s, "%s| Runqueue Selector: %d|\n", indt,
+				((struct gv100_runlist_chan*)entry)->runqueue_selector);
+		seq_printf(s, "%s| Instance PTR: |\n", indt);
+		seq_printf(s, "%s| %#018llx |\n", indt, instance_ptr);
+		seq_printf(s, "%s| %-20s|\n", indt, target_to_text(inst_target(g, entry)));
+		seq_printf(s, "%s+---------------------+\n", indt);
 #endif
 	}
 	return 0;
 }
 
 static const struct seq_operations runlist_file_seq_ops = {
-  .start = runlist_file_seq_start,
-  .next = runlist_file_seq_next,
-  .stop = runlist_file_seq_stop,
-  .show = runlist_file_seq_show,
+	.start = runlist_file_seq_start,
+	.next = runlist_file_seq_next,
+	.stop = runlist_file_seq_stop,
+	.show = runlist_file_seq_show,
 };
 
 static int runlist_file_open(struct inode *inode, struct file *f) {
@@ -157,6 +183,7 @@ ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_tsgid;
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	if (err)
 		return err;
@@ -165,7 +192,7 @@
 		return -ERANGE;
 
 	// Execute preemption
-	err = preempt_tsg(target_tsgid);
+	err = preempt_tsg(g, target_tsgid);
 	if (err)
 		return err;
@@ -181,9 +208,9 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_channel;
 	channel_ctrl_t chan;
 	int err;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	runlist_info_t rl_info;
+	runlist_disable_t rl_disable;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
 	if (err)
@@ -195,7 +222,16 @@
 	// Disable channel
 	chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
 	chan.enable_clear = true;
+	// disable sched
+	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
+	rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
+	rl_disable.raw |= BIT(rl_info.id);
+	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
+	// disable chan
 	nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw);
+	// enable sched
+	rl_disable.raw &= ~BIT(rl_info.id);
+	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
 
 	return count;
 }
@@ -209,9 +245,7 @@ ssize_t enable_channel_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_channel;
 	channel_ctrl_t chan;
 	int err;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
 	if (err)
@@ -235,14 +269,12 @@ const struct file_operations enable_channel_file_ops = {
 ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 	size_t count, loff_t *off) {
 	uint32_t target_tsgid;
-	struct runlist_chan* chan;
+	struct gv100_runlist_chan* chan;
 	channel_ctrl_t chan_ctl;
 	struct runlist_iter rl_iter;
 	int err;
 	loff_t pos = 0;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
 	if (err)
@@ -251,32 +283,34 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 	if (target_tsgid > MAX_TSGID)
 		return -ERANGE;
 
-	err = get_runlist_iter(&rl_iter);
+	err = get_runlist_iter(g, 0, &rl_iter);
 	if (err)
 		return err;
 
 	// Iterate through all TSGs
 	while (pos < rl_iter.rl_info.len) {
-		if (rl_iter.curr_tsg->tsgid == target_tsgid) {
+		if (tsgid(g, rl_iter.curr_entry) == target_tsgid) {
 			// Enable channels of target TSG
-			for_chan_in_tsg(chan, rl_iter.curr_tsg) {
+			for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
 				chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
 				chan_ctl.enable_set = true;
 				nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
 			}
 		} else {
+			// XXX: Fix for bare channels. Maybe a "for_chan_until_tsg" macro?
 			// Disable all other channels
-			for_chan_in_tsg(chan, rl_iter.curr_tsg) {
+			// (This is how the Jetson nvgpu driver disables TSGs)
+			for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
 				chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
 				chan_ctl.enable_clear = true;
 				nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
 			}
 		}
-		pos += 1 + rl_iter.curr_tsg->tsg_length;
-		rl_iter.curr_tsg = next_tsg(rl_iter.curr_tsg);
+		pos += 1 + tsg_length(g, rl_iter.curr_entry);
+		rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
 	}
 	// Switch to next TSG with active channels (should be our TSG)
-	err = preempt_tsg(target_tsgid);
+	err = preempt_tsg(g, target_tsgid);
 	if (err)
 		return err;
diff --git a/stubs.h b/stubs.h
new file mode 100644
index 0000000..bfcc0d7
--- /dev/null
+++ b/stubs.h
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpful private functions copied from elsewhere in the kernel tree
+ * DO NOT MODIFY
+ */
+#include <linux/pci.h>
+
+// Functions from drivers/pci/pci.h
+/**
+ * pci_match_one_device - Tell if a PCI device structure has a matching
+ *                        PCI device id structure
+ * @id: single PCI device id structure to match
+ * @dev: the PCI device structure to match against
+ *
+ * Returns the matching pci_device_id structure or %NULL if there is no match.
+ */
+static inline const struct pci_device_id *
+pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+	if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+	    (id->device == PCI_ANY_ID || id->device == dev->device) &&
+	    (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
+	    (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
+	    !((id->class ^ dev->class) & id->class_mask))
+		return id;
+	return NULL;
+}
+
+// Functions from drivers/pci/search.c
+#include <linux/device.h>
+#include <linux/version.h>
+extern struct bus_type pci_bus_type;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)
+static int match_pci_dev_by_id(struct device *dev, void *data)
+#else
+static int match_pci_dev_by_id(struct device *dev, const void *data)
+#endif
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	const struct pci_device_id *id = data;
+
+	if (pci_match_one_device(id, pdev))
+		return 1;
+	return 0;
+}
+
+/*
+ * pci_get_dev_by_id - begin or continue searching for a PCI device by id
+ * @id: pointer to struct pci_device_id to match for the device
+ * @from: Previous PCI device found in search, or %NULL for new search.
+ *
+ * Iterates through the list of known PCI devices. If a PCI device is found
+ * with a matching id a pointer to its device structure is returned, and the
+ * reference count to the device is incremented. Otherwise, %NULL is returned.
+ * A new search is initiated by passing %NULL as the @from argument. Otherwise
+ * if @from is not %NULL, searches continue from next device on the global
+ * list. The reference count for @from is always decremented if it is not
+ * %NULL.
+ *
+ * This is an internal function for use by the other search functions in
+ * this file.
+ */
+static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id,
+					 struct pci_dev *from)
+{
+	struct device *dev;
+	struct device *dev_start = NULL;
+	struct pci_dev *pdev = NULL;
+
+	if (from)
+		dev_start = &from->dev;
+	dev = bus_find_device(&pci_bus_type, dev_start, (void *)id,
+			      match_pci_dev_by_id);
+	if (dev)
+		pdev = to_pci_dev(dev);
+	pci_dev_put(from);
+	return pdev;
+}
-- 
cgit v1.2.2
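Post-script for review (not part of the patch): below is a minimal userspace sketch of how the new per-GPU ProcFS interface is intended to be exercised. It assumes a single GPU enumerated as gpu0 and uses a placeholder TSG ID of 1; real TSG IDs can be read from /proc/gpu0/runlist0. num_gpcs is backed by nvdebug_reg32_read() and returns the raw register value in hex; preempt_tsg parses the written string with kstrtou32_from_user().

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[16] = {0};
	// Read a world-readable register file (hex-formatted, e.g. "0x6\n")
	int fd = open("/proc/gpu0/num_gpcs", O_RDONLY);
	if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
		perror("num_gpcs");
		return 1;
	}
	printf("num_gpcs: %s", buf);
	close(fd);
	// Ask nvdebug to preempt TSG 1 (placeholder ID) off the GPU
	fd = open("/proc/gpu0/preempt_tsg", O_WRONLY);
	if (fd < 0 || write(fd, "1", strlen("1")) < 0) {
		perror("preempt_tsg");
		return 1;
	}
	close(fd);
	return 0;
}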