author	Joshua Bakita <bakitajoshua@gmail.com>	2023-06-22 12:52:59 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-06-22 12:52:59 -0400
commit	306a03d18b305e4e573be3b2931978fa10679eb9 (patch)
tree	349570dfbe5f531e903c949c3f663627ee1097a8
parent	f4b83713672acaf88a526b930b8e417453f6edc5 (diff)
Quick dump of current state for Ben to review.
-rw-r--r--	Makefile	13
-rw-r--r--	device_info_procfs.c	126
-rw-r--r--	mmu.c	251
-rw-r--r--	nvdebug.h	719
-rw-r--r--	nvdebug_entry.c	288
-rw-r--r--	runlist.c	221
-rw-r--r--	runlist_procfs.c	188
-rw-r--r--	stubs.h	80
8 files changed, 1614 insertions, 272 deletions
diff --git a/Makefile b/Makefile
index 18c07e8..2dc90c7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,14 @@
1obj-m += nvdebug.o 1obj-m += nvdebug.o
2nvdebug-objs = runlist_procfs.o runlist.o nvdebug_entry.o 2nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o nvdebug_entry.o
3KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\" 3KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\"
4# Add -mfentry above if the build fails due to a missing mcount symbol
4 5
5# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...) 6# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
6#ccflags-y += -I$(PWD)/include 7ccflags-y += -I$(PWD)/include
7ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include 8#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
8ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu 9#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
9ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include 10#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
10ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi 11#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi
11 12
12all: 13all:
13 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules 14 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
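
# Typical usage (illustrative, not part of this commit): build against the
# running kernel's headers, then load the module and inspect the kernel log:
#   make
#   sudo insmod nvdebug.ko
#   dmesg | tail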
diff --git a/device_info_procfs.c b/device_info_procfs.c
new file mode 100644
index 0000000..cd6c53c
--- /dev/null
+++ b/device_info_procfs.c
@@ -0,0 +1,126 @@
1#include "nvdebug.h"
2#include <linux/seq_file.h> // For seq_* functions and types
3#include <linux/uaccess.h> // For copy_to_user()
4
5// Generic register printing function, used for PTOP_*_NUM registers (+more)
6// @param f File being read from. `data` field is register offset to read.
7// @param buf User buffer for result
8// @param size Length of user buffer
9// @param off Requested offset. Updated by number of characters written.
10// @return -errno on error, otherwise number of bytes written to *buf
11// Note: Parent `data` field MUST be the GPU index
12static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
13 char out[16];
14 int chars_written;
15 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
16 if (size < 16 || *off != 0)
17 return 0;
18 // 32 bit register will always take less than 16 characters to print
19 chars_written = scnprintf(out, 16, "%#0x\n", nvdebug_readl(g, (uintptr_t)PDE_DATA(file_inode(f))));
20 if (copy_to_user(buf, out, chars_written))
21 printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
22 *off += chars_written;
23 return chars_written;
24}
25const struct file_operations nvdebug_read_reg32_file_ops = {
26 .read = nvdebug_reg32_read,
27};
28
29//// ==v== PTOP_DEVICE_INFO ==v== ////
30
31// Called to start or resume a sequence. Prior to 4.19, *pos is unreliable.
32// Initializes iterator `idx` state and returns it. Ends sequence on NULL.
33static void* device_info_file_seq_start(struct seq_file *s, loff_t *pos) {
34 static int idx;
35 // If start of sequence, reset `idx`
36 if (*pos == 0)
37 idx = 0;
38 // Number of possible info entries is fixed, and list is sparse
39 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1)
40 return NULL;
41 return &idx;
42}
43
44// Steps to next record. Returns new value of `idx`.
45// Calls show() on non-NULL return
46static void* device_info_file_seq_next(struct seq_file *s, void *idx,
47 loff_t *pos) {
48 (*pos)++; // Required by seq interface
49 // Number of possible info entries is fixed, and list is sparse
50	if (++(*(int*)idx) >= NV_PTOP_DEVICE_INFO__SIZE_1)
51 return NULL;
52 return idx;
53}
54
55// Print info at index *idx. Returns non-zero on error.
56static int device_info_file_seq_show(struct seq_file *s, void *idx) {
57 ptop_device_info_t curr_info;
58 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
59
60 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO(*(int*)idx));
61 // Check for read errors
62 if (curr_info.raw == -1)
63 return -EIO;
64
65 // Parse and print the data
66 switch(curr_info.info_type) {
67 case INFO_TYPE_DATA:
68 // As of early 2022, only the ENUM2 format of this entry exists
69 if (curr_info.is_not_enum2)
70 break;
71 seq_printf(s, "| BAR0 Base %#.8x\n"
72 "| instance %d\n",
73 curr_info.pri_base << 12, curr_info.inst_id);
74 if (curr_info.fault_id_is_valid)
75 seq_printf(s, "| Fault ID: %3d\n", curr_info.fault_id);
76 break;
77 case INFO_TYPE_ENUM:
78 if (curr_info.engine_is_valid)
79 seq_printf(s, "| Host's Engine ID: %2d\n", curr_info.engine_enum);
80 if (curr_info.runlist_is_valid)
81 seq_printf(s, "| Runlist ID: %2d\n", curr_info.runlist_enum);
82 if (curr_info.intr_is_valid)
83 seq_printf(s, "| Interrupt ID: %2d\n", curr_info.intr_enum);
84 if (curr_info.reset_is_valid)
85 seq_printf(s, "| Reset ID: %2d\n", curr_info.reset_enum);
86 break;
87 case INFO_TYPE_ENGINE_TYPE:
88 seq_printf(s, "| Engine Type: %2d (", curr_info.engine_type);
89 if (curr_info.engine_type < ENGINE_TYPES_LEN)
90 seq_printf(s, "%s)\n", ENGINE_TYPES_NAMES[curr_info.engine_type]);
91 else
92 seq_printf(s, "Unknown Engine, introduced post-Ampere)\n");
93 break;
94 case INFO_TYPE_NOT_VALID:
95 default:
96 // Device info records are sparse, so skip unset or unknown ones
97 return 0;
98 }
99
100 // Draw a line between each device entry
101 if (!curr_info.has_next_entry)
102 seq_printf(s, "+---------------------+\n");
103 return 0;
104}
105
106static void device_info_file_seq_stop(struct seq_file *s, void *idx) {
107 // No cleanup needed
108}
109
110static const struct seq_operations device_info_file_seq_ops = {
111 .start = device_info_file_seq_start,
112 .next = device_info_file_seq_next,
113 .stop = device_info_file_seq_stop,
114 .show = device_info_file_seq_show,
115};
116
117static int device_info_file_open(struct inode *inode, struct file *f) {
118 return seq_open(f, &device_info_file_seq_ops);
119}
120
121const struct file_operations device_info_file_ops = {
122 .open = device_info_file_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = seq_release,
126};
diff --git a/mmu.c b/mmu.c
new file mode 100644
index 0000000..26c7af5
--- /dev/null
+++ b/mmu.c
@@ -0,0 +1,251 @@
1// Helpers to deal with NVIDIA's MMU and associated page tables
2#include <linux/kernel.h> // Kernel types
3
4#include "nvdebug.h"
5
6/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
7 a configurable 1MB window into VRAM which is mapped into BAR0 (register)
8 space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
9   and appears to be used today to bootstrap page table configuration.
10
11 Why is it mapped at a location called NVIDIA Private RAM Instance? Because
12   this used to point to the entirety of instance RAM, which was separate from
13 VRAM on older NVIDIA GPUs.
14*/
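
/* Illustrative sketch (not part of this commit): reading one 32-bit word of
   VRAM through the PRAMIN window, assuming the window already covers the
   target address. vram2PRAMIN() below performs the validity checks; note
   that it returns 0 both on error and for the window base itself. */
static inline uint32_t example_read_vram_word(struct nvdebug_state *g,
                                              uint64_t vram_addr) {
	return readl(g->regs + NV_PRAMIN + vram2PRAMIN(g, vram_addr));
}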
15
16/* Convert a physical VRAM address to an offset in the PRAMIN window
17 @param addr VRAM address to convert
18 @return 0 on error, PRAMIN offset on success
19
20 Note: Use off2PRAMIN() instead if you want a dereferenceable address
21*/
22uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
23 uint64_t pramin_base_va;
24 bar0_window_t window;
25 window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
26 // Check if the address is valid (49 bits are addressable on-GPU)
27 if (addr & ~0x0001ffffffffffff) {
28 printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
29 addr, __func__);
30 return 0;
31 }
32 // For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
33 if (window.target != TARGET_VID_MEM)
34 return 0;
35 pramin_base_va = ((uint64_t)window.base) << 16;
36 // Protect against out-of-bounds accesses
37	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
38 return 0;
39 return addr - pramin_base_va;
40}
41
42/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
43   straightforward starting with Pascal ("page table version 2"), except for a
44 few quirks (like 16-byte PDE0 entries, but all other entries are 8 bytes).
45
46 All you really need to know is that any given Page Directory Entry (PDE)
47 contains a pointer to the start of a 4k page densely filled with PDEs or Page
48 Table Entries (PTEs).
49
50 == Page Table Refresher ==
51 Page tables convert virtual addresses to physical addresses, and they do this
52   via a tree structure. Leaves (PTEs) contain a physical address, and the path
53   from root to leaf is defined by the virtual address. Non-leaf nodes are PDEs.
54   When descending, the virtual address is sliced into pieces, and one slice is
55 used at each level (as an index) to select the next-visited node (in level+1).
56
57 V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
58 PTEs. How the virtual address is sliced to yield an index into each level and
59 a page offset is shown by Fig 1.
60
61 == Figure 1 ==
62 Page Offset (12 bits) <---------------------------------------+
63 Page Table Entry (PTE) (9 bits) <--------------------+ |
64 Page Directory Entry (PDE) 0 (8 bits) <-----+ | |
65 PDE1 (8 bits) <--------------------+ | | |
66 PDE2 (8 bits) <-----------+ | | | |
67 PDE3 (2 bits) <--+ | | | | |
68 ^ ^ ^ ^ ^ ^
69 Virtual addr: [49, 47] [46, 38] [37, 29] [28, 21] [20, 12] [11, 0]
70
71 The following arrays merely represent different projections of Fig. 1, and
72 only one is strictly needed to reconstruct all the others. However, due to
73 the complexity of page tables, we include all of these to aid in readability.
74*/
75// How many nodes/entries per level in V2 of NVIDIA's page table format
76static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
77// Size in bytes of an entry at a particular level
78static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
79// Which bit index is the least significant in indexing each page level
80static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};
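
// Illustrative sketch (not part of this commit): the per-level index of a
// virtual address follows directly from the two tables above. Every level
// size is a power of two, so a shift-and-mask matches Fig. 1.
static inline uint64_t nv_mmu_pt_v2_index(uint64_t va, int level) {
	return (va >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
}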
81
82// Convert a GPU physical address to CPU virtual address via the PRAMIN window
83void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
84 return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
85}
86
87/* FIXME
88void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
89 return g->bar2 + off;
90}
91*/
92
93uint64_t search_page_directory_subtree(struct nvdebug_state *g,
94 void __iomem *pde_offset,
95 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
96 uint64_t addr_to_find,
97 uint32_t level) {
98 uint64_t res, i;
99 void __iomem *next;
100 page_dir_entry_t entry;
101	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
102 return 0;
103 // Hack to workaround PDE0 being double-size and strangely formatted
104 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
105 pde_offset += 8;
106 entry.raw = readl(pde_offset);
107 // If we reached an invalid (unpopulated) PDE, walk back up the tree
108 if (entry.target == PD_AND_TARGET_INVALID)
109 return 0;
110 // Succeed when we reach a PTE with the address we want
111 if (entry.is_pte) {
112 printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n", ((u64)entry.addr) << 12, entry.raw);
113 return (uint64_t)entry.addr << 12 == addr_to_find;
114 }
115 printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n", ((u64)entry.addr) << 12, entry.target, level, entry.raw);
116 // Depth-first search of the page table
117	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
118 next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
119 // off2addr can fail
120 if (!next) {
121 printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
122 return 0;
123 }
124 res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
125 if (res)
126 return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
127 }
128 return 0;
129}
130
131/* Search a page directory of the GPU MMU
132 @param pde_offset Dereferenceable pointer to the start of the PDE3 entries
133 @param off2addr Func to converts VRAM phys addresses to valid CPU VAs
134 @param addr_to_find Physical address to reconstruct the virtual address of
135 @return 0 on error, otherwise the virtual address at which addr_to_find is
136 mapped into by this page table.
137*/
138uint64_t search_page_directory(struct nvdebug_state *g,
139 void __iomem *pde_offset,
140 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
141 uint64_t addr_to_find) {
142 uint64_t res, i;
143 // Make sure that the query is page-aligned
144 if (addr_to_find & 0xfff) {
145 printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
146 return 0;
147 }
148 // Search the top-level page directory (PDE3)
149 for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
150 if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
151 return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
152 return 0;
153}
154
155/* GMMU Page Tables Version 1
156 This page table only contains 2 levels and is used in the Fermi, Kepler, and
157 Maxwell architectures
158*/
159// Number of entries in the PDE and PTE levels
160static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 1<<13 is an educated guess!!!
161// Which bit index is the least significant in indexing each page level
162static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
163uint64_t search_v1_page_directory(struct nvdebug_state *g,
164 void __iomem *pde_offset,
165 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
166 uint64_t addr_to_find) {
167 uint64_t j, i = 0;
168 page_dir_entry_v1_t pde;
169 page_tbl_entry_v1_t pte;
170 void __iomem *pte_offset;
171 // For each PDE
172 do {
173 // readq doesn't seem to work on BAR0
174 pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
175 pde.raw <<= 32;
176 pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
177 // Verify PDE is present
178 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
179 continue;
180 // Convert to a dereferencable pointer from CPU virtual address space
181 pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
182 if (!pte_offset)
183 continue;
184// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
185// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
186 // For each PTE
187 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
188 // Don't overrun the PRAMIN window
189			if (pte_offset + j * sizeof(page_tbl_entry_v1_t) >= g->regs + NV_PRAMIN + NV_PRAMIN_LEN)
190 return 0;
191 pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
192 pte.raw <<= 32;
193 pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
194 // Skip non-present PTEs
195 if (!pte.is_present)
196 continue;
197// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
198 // If we find a matching PTE, return its virtual address
199 if ((uint64_t)pte.addr << 12 == addr_to_find)
200 return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
201
202 }
203 } while (++i < NV_MMU_PT_V1_SZ[0]);
204 return 0;
205}
206
207/* GMMU Page Tables Version 0
208 This page table only contains 2 levels and is used in the Tesla architecture
209*/
210/* *** UNTESTED ***
211#define NV_MMU_PT_V0_SZ 2048
212#define NV_MMU_PT_V0_LSB 29
213uint64_t search_v0_page_directory(struct nvdebug_state *g,
214 void __iomem *pde_offset,
215 void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
216 uint32_t addr_to_find) {
217 int j, i = 0;
218 page_dir_entry_v0_t pde;
219 page_tbl_entry_v0_t pte;
220 void __iomem *pte_offset;
221 // For each PDE
222 do {
223 // readq doesn't seem to work on BAR0
224 pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
225 pde.raw <<= 32;
226 pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
227 //if (pde.raw)
228 //printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
229 // Skip unpopulated PDEs
230 if (pde.type == NOT_PRESENT)
231 continue;
232 //printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
233 pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
234 // For each PTE
235 for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
236 pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
237 pte.raw <<= 32;
238 pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
239 // Skip non-present PTEs
240 if (!pte.is_present)
241 continue;
242 // If we find a matching PTE, return its virtual address
243 //if (pte.addr != 0x5555555)
244 // printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
245 if (pte.addr << 12 == addr_to_find)
246 return i << NV_MMU_PT_V0_LSB | j << 12;
247 }
248 } while (++i < NV_MMU_PT_V0_SZ);
249 return 0; // No match
250}
251*/
diff --git a/nvdebug.h b/nvdebug.h
index 9ac71da..1882756 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -5,14 +5,18 @@
5// TODO(jbakita): Don't depend on these. 5// TODO(jbakita): Don't depend on these.
6#include <nvgpu/gk20a.h> // For struct gk20a 6#include <nvgpu/gk20a.h> // For struct gk20a
7#include <os/linux/os_linux.h> // For struct nvgpu_os_linux 7#include <os/linux/os_linux.h> // For struct nvgpu_os_linux
8#include <linux/proc_fs.h> // For PDE_DATA() macro
8 9
9/* Runlist Channel 10/* Runlist Channel
10 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue 11 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
11 of GPU commands. These commands are typically queued from userspace. 12 of GPU commands. These commands are typically queued from userspace.
12 13
13 `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU 14 Prior to Volta, channels could also exist independent of a TSG. These are
14 virtual address space for this context. All channels in a TSG point to the 15 called "bare channels" in the Jetson nvgpu driver.
15 same GPU Instance Block (?). 16
17 `INST_PTR` points to a GPU Instance Block which contains FIFO states, virtual
18 address space configuration for this context, and a pointer to the page
19 tables. All channels in a TSG point to the same GPU Instance Block (?).
16 20
17 "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and 21 "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
18 thereby which PBDMA will run the channel. Increasing values select 22 thereby which PBDMA will run the channel. Increasing values select
@@ -30,7 +34,13 @@
30 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN 34 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN
31 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID) 35 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID)
32 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if 36 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
33 more than one PBDMA is supported by the runlist 37 more than one PBDMA is supported by the runlist,
38 additionally, "A value of 0 targets the first FE
39 pipe, which can process all FE driven engines:
40 Graphics, Compute, Inline2Memory, and TwoD. A value
41 of 1 targets the second FE pipe, which can only
42 process Compute work. Note that GRCE work is allowed
43 on either runqueue.)"
34 44
35 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer 45 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer
36 INST_PTR_HI : upper 32 bit of instance block pointer 46 INST_PTR_HI : upper 32 bit of instance block pointer
@@ -39,6 +49,9 @@
39 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer 49 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer
40 USERD_PTR_HI : upper 32 bits of USERD pointer 50 USERD_PTR_HI : upper 32 bits of USERD pointer
41 USERD_TARGET (TGU) : aperture of the USERD data structure 51 USERD_TARGET (TGU) : aperture of the USERD data structure
52
53 Channels were around since at least Fermi, but were rearranged with Volta to
54 add a USERD pointer, a longer INST pointer, and a runqueue selector flag.
42*/ 55*/
43enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; 56enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
44enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; 57enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
@@ -52,11 +65,12 @@ static inline char* target_to_text(enum INST_TARGET t) {
52 return "SYS_MEM_NONCOHERENT"; 65 return "SYS_MEM_NONCOHERENT";
53 default: 66 default:
54 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); 67 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
55 return NULL; 68 return "INVALID";
56 } 69 }
57} 70}
58 71
59struct runlist_chan { 72// Support: Volta, Ampere, Turing
73struct gv100_runlist_chan {
60// 0:63 74// 0:63
61 enum ENTRY_TYPE entry_type:1; 75 enum ENTRY_TYPE entry_type:1;
62 uint32_t runqueue_selector:1; 76 uint32_t runqueue_selector:1;
@@ -71,6 +85,20 @@ struct runlist_chan {
71 uint32_t inst_ptr_hi:32; 85 uint32_t inst_ptr_hi:32;
72} __attribute__((packed)); 86} __attribute__((packed));
73 87
88// Support: Fermi, Kepler*, Maxwell, Pascal
89// *In Kepler, inst fields may be unpopulated?
90struct gm107_runlist_chan {
91 uint32_t chid:12;
92 uint32_t padding0:1;
93 enum ENTRY_TYPE entry_type:1;
94 uint32_t padding1:18;
95 uint32_t inst_ptr_lo:20;
96 enum INST_TARGET inst_target:2; // Totally guessing on this
97 uint32_t padding2:10;
98} __attribute__((packed));
99
100#define gk110_runlist_chan gm107_runlist_chan
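
// Illustrative sketch (not part of this commit): per the INST_PTR_LO/HI
// description above, the 4k-aligned instance block pointer of a Volta-style
// channel entry is reassembled as:
static inline uint64_t chan_inst_ptr(const struct gv100_runlist_chan *chan) {
	return ((uint64_t)chan->inst_ptr_hi << 32)
	       | ((uint64_t)chan->inst_ptr_lo << 12);
}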
101
74/* Runlist TSG (TimeSlice Group) 102/* Runlist TSG (TimeSlice Group)
75 The runlist is composed of timeslice groups (TSG). Each TSG corresponds 103 The runlist is composed of timeslice groups (TSG). Each TSG corresponds
76 to a single virtual address space on the GPU and contains `TSG_LENGTH` 104 to a single virtual address space on the GPU and contains `TSG_LENGTH`
@@ -85,8 +113,15 @@ struct runlist_chan {
85 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice 113 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
86 TSG_LENGTH : number of channels that are part of this timeslice group 114 TSG_LENGTH : number of channels that are part of this timeslice group
87 TSGID : identifier of the Timeslice group (overlays ENTRY_ID) 115 TSGID : identifier of the Timeslice group (overlays ENTRY_ID)
116
117 TSGs appear to have been introduced with Kepler and stayed the same until
118 they were rearranged at the time of channel rearrangement to support longer
119 GPU instance addresses with Volta.
88*/ 120*/
89struct entry_tsg { 121
122// Support: Volta, Ampere*, Turing*
123// *These treat the top 8 bits of TSGID as GFID (unused)
124struct gv100_runlist_tsg {
90// 0:63 125// 0:63
91 enum ENTRY_TYPE entry_type:1; 126 enum ENTRY_TYPE entry_type:1;
92 uint64_t padding:15; 127 uint64_t padding:15;
@@ -101,14 +136,28 @@ struct entry_tsg {
101} __attribute__((packed)); 136} __attribute__((packed));
102#define MAX_TSGID (1 << 12) 137#define MAX_TSGID (1 << 12)
103 138
139// Support: Kepler (v2?), Maxwell, Pascal
140// Same fields as Volta except tsg_length is 6 bits rather than 8
141// Last 32 bits appear to contain an undocumented inst ptr
142struct gk110_runlist_tsg {
143 uint32_t tsgid:12;
144 uint32_t padding0:1;
145 enum ENTRY_TYPE entry_type:1;
146 uint32_t timeslice_scale:4;
147 uint32_t timeslice_timeout:8;
148 uint32_t tsg_length:6;
149 uint32_t padding1:32;
150} __attribute__((packed));
151
152
104enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1}; 153enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};
105 154
106/* Preempt a TSG or Channel by ID 155/* Preempt a TSG or Channel by ID
107 ID/CHID : Id of TSG or channel to preempt 156 ID/CHID : Id of TSG or channel to preempt
108 IS_PENDING : ???? 157 IS_PENDING : Is a context switch pending?
109 TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG 158 TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
110 159
111 Support: Kepler, Maxwell, Pascal, Volta 160 Support: Kepler, Maxwell, Pascal, Volta, Turing
112*/ 161*/
113#define NV_PFIFO_PREEMPT 0x00002634 162#define NV_PFIFO_PREEMPT 0x00002634
114typedef union { 163typedef union {
@@ -195,26 +244,36 @@ typedef union {
195 */ 244 */
196 245
197// Note: This is different with Turing 246// Note: This is different with Turing
198// Support: Kepler, Maxwell, Pascal, Volta 247// Support: Fermi, Kepler, Maxwell, Pascal, Volta
199#define NV_PFIFO_RUNLIST_BASE 0x00002270 248#define NV_PFIFO_RUNLIST_BASE 0x00002270
249#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
200typedef union { 250typedef union {
201 struct { 251 struct {
202 uint32_t ptr:28; 252 uint32_t ptr:28;
203 uint32_t type:2; 253 enum INST_TARGET target:2;
204 uint32_t padding:2; 254 uint32_t padding:2;
205 } __attribute__((packed)); 255 } __attribute__((packed));
206 uint32_t raw; 256 uint32_t raw;
207} runlist_base_t; 257} runlist_base_t;
208 258
209// Support: Kepler, Maxwell, Pascal, Volta 259// Support: Kepler, Maxwell, Pascal, Volta
260// Works on Fermi, but id is one bit longer and is b11111
210#define NV_PFIFO_RUNLIST 0x00002274 261#define NV_PFIFO_RUNLIST 0x00002274
262#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
211typedef union { 263typedef union {
264 // RUNLIST fields
212 struct { 265 struct {
213 uint32_t len:16; 266 uint32_t len:16;
214 uint32_t padding:4; 267 uint32_t padding:4;
215		uint32_t id:4; 268		uint32_t id:4; // Runlist ID (each engine may have a separate runlist)
216 uint32_t padding2:8; 269 uint32_t padding2:8;
217 } __attribute__((packed)); 270 } __attribute__((packed));
271 // ENG_RUNLIST fields that differ
272 struct {
273 uint32_t padding3:20;
274 bool is_pending:1; // Is runlist not yet committed?
275 uint32_t padding4:11;
276 } __attribute__((packed));
218 uint32_t raw; 277 uint32_t raw;
219} runlist_info_t; 278} runlist_info_t;
220 279
@@ -301,63 +360,631 @@ typedef union {
301 uint32_t raw; 360 uint32_t raw;
302} runlist_disable_t; 361} runlist_disable_t;
303 362
363/* Read GPU descriptors from the Master Controller (MC)
364
365  MINOR_REVISION : Legacy (only used with Kelvin in Nouveau)
366  MAJOR_REVISION : Legacy (only used with Kelvin in Nouveau)
367 IMPLEMENTATION : Which implementation of the GPU architecture
368 ARCHITECTURE : Which GPU architecture
369
370  CHIP_ID = (ARCHITECTURE << 4) | IMPLEMENTATION
371 CHIP_ID : Unique ID of all chips since Kelvin
372
373  Support: Kelvin, Rankine, Curie, Tesla, Fermi, Kepler, Maxwell, Pascal,
374 Volta, Turing, Ampere
375*/
376#define NV_MC_BOOT_0 0x00000000
377#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
378#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
379#define NV_CHIP_ID_KEPLER 0x0E0
380#define NV_CHIP_ID_VOLTA 0x140
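// Worked example: GV11B has CHIP_ID 0x15B, i.e. ARCHITECTURE = 0x15
// (integrated Volta; see ARCH2NAME below) and IMPLEMENTATION = 0xB,
// since (0x15 << 4) | 0xB == 0x15B.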
381
382inline static const char* ARCH2NAME(uint32_t arch) {
383 switch (arch) {
384 case 0x01:
385 return "Celsius";
386 case 0x02:
387 return "Kelvin";
388 case 0x03:
389		return "Rankine";
390 case 0x04:
391 case 0x06: // 0x06 is (nForce 6XX integrated only)
392 return "Curie";
393 // 0x07 is unused/skipped
394 case 0x05: // First Tesla card was released before the nForce 6XX
395 case 0x08:
396 case 0x09:
397 case 0x0A:
398 return "Tesla";
399 // 0x0B is unused/skipped
400 case 0x0C:
401 case 0x0D:
402 return "Fermi";
403 case 0x0E:
404 case 0x0F:
405 case 0x11:
406 return "Kepler";
407 case 0x12:
408 return "Maxwell";
409 case 0x13:
410 return "Pascal";
411 case 0x14:
412 case 0x15: // Volta integrated
413 return "Volta";
414 case 0x16:
415 return "Turing";
416 case 0x17:
417 return "Ampere";
418 case 0x18:
419 case 0x19:
420 return "Hopper (?) or Lovelace (?)";
421 default:
422 if (arch < 0x19)
423 return "[unknown historical architecture]";
424 else
425 return "[future]";
426 }
427}
428
429typedef union {
430 // Fields as defined in the NVIDIA reference
431 struct {
432 uint32_t minor_revision:4;
433 uint32_t major_revision:4;
434 uint32_t reserved:4;
435 uint32_t padding0:8;
436 uint32_t implementation:4;
437 uint32_t architecture:5;
438 uint32_t padding1:3;
439 } __attribute__((packed));
440 uint32_t raw;
441 // Arch << 4 + impl is also often used
442 struct {
443 uint32_t padding2:20;
444 uint32_t chip_id:9;
445 uint32_t padding3:3;
446 } __attribute__((packed));
447} mc_boot_0_t;
448
449enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3};
450enum ENGINE_TYPES {
451 ENGINE_GRAPHICS = 0, // GRAPHICS [/compute]
452 ENGINE_COPY0 = 1, // [raw/physical] COPY #0
453 ENGINE_COPY1 = 2, // [raw/physical] COPY #1
454 ENGINE_COPY2 = 3, // [raw/physical] COPY #2
455
456 ENGINE_MSPDEC = 8, // Picture DECoder
457 ENGINE_MSPPP = 9, // [Video] Post Processing
458 ENGINE_MSVLD = 10, // [Video] Variable Length Decoder
459 ENGINE_MSENC = 11, // [Video] ENCoding
460 ENGINE_VIC = 12, // Video Image Compositor
461 ENGINE_SEC = 13, // SEquenCer [?]
462 ENGINE_NVENC0 = 14, // Nvidia Video ENCoder #0
463 ENGINE_NVENC1 = 15, // Nvidia Video ENCoder #1
464 ENGINE_NVDEC = 16, // Nvidia Video DECoder
465
466 ENGINE_IOCTRL = 18, // I/O ConTRoLler [of NVLINK at least]
467 ENGINE_LCE = 19, // Logical Copy Engine
468 ENGINE_GSP = 20, // Gpu System Processor
469 ENGINE_NVJPG = 21, // NVidia JPeG [Decoder] (Ampere+)
470};
471#define ENGINE_TYPES_LEN 22
472static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = {
473 "Graphics/Compute",
474 "COPY0",
475 "COPY1",
476 "COPY2",
477 "Unknown Engine ID#4",
478 "Unknown Engine ID#5",
479 "Unknown Engine ID#6",
480 "Unknown Engine ID#7",
481 "MSPDEC: Picture Decoder",
482 "MSPPP: Post Processing",
483 "MSVLD: Variable Length Decoder",
484 "MSENC: Encoder",
485 "VIC: Video Image Compositor",
486 "SEC: Sequencer",
487 "NVENC0: NVIDIA Video Encoder #0",
488 "NVENC1: NVIDIA Video Encoder #1",
489 "NVDEC: NVIDIA Video Decoder",
490 "Unknown Engine ID#17",
491 "IOCTRL: I/O Controller",
492 "LCE: Logical Copy Engine",
493 "GSP: GPU System Processor",
494 "NVJPG: NVIDIA JPEG Decoder",
495};
496
497/* GPU engine information and control register offsets
498 Each engine is described by one or more entries (terminated by an entry with
499 the `has_next_entry` flag unset) in the fixed-size PTOP_DEVICE_INFO table. A
500  typical device, such as the graphics/compute engine or a copy engine, is
501  described by three entries, one of each type.
502
503 The PTOP_DEVICE_INFO table is sparsely populated (entries of type
504 INFO_TYPE_NOT_VALID may be intermingled with valid entries), so any traversal
505 code should check all NV_PTOP_DEVICE_INFO__SIZE_1 entries and not terminate
506 upon reaching the first entry of INFO_TYPE_NOT_VALID.
507
508 INFO_TYPE : Is this a DATA, ENUM, or ENGINE_TYPE table entry?
509 HAS_NEXT_ENTRY : Does the following entry refer to the same engine?
510
511 == INFO_TYPE_DATA fields ==
512 PRI_BASE : BAR0 base = (PRI_BASE << 12) aka 4k aligned.
513 INST_ID : "Note that some instanced [engines] (such as logical copy
514 engines aka LCE) share a PRI_BASE across all [engines] of
515 the same engine type; such [engines] require an additional
516                    offset: instanced base = BAR0 base + stride * INST_ID."
517 FAULT_ID_IS_VALID : Does this engine have its own bind point and fault ID
518 with the MMU?
519 FAULT_ID : "The MMU fault id used by this [engine]. These IDs
520 correspond to the NV_PFAULT_MMU_ENG_ID define list."
521
522 == INFO_TYPE_ENUM fields ==
523 ENGINE_IS_VALID : Is this engine a host engine?
524 ENGINE_ENUM : "[T]he host engine ID for the current [engine] if it is
525 a host engine, meaning Host can send methods to the
526 engine. This id is used to index into any register array
527 whose __SIZE_1 is equal to NV_HOST_NUM_ENGINES. A given
528 ENGINE_ENUM can be present for at most one device in the
529 table. Devices corresponding to all ENGINE_ENUM ids 0
530 through NV_HOST_NUM_ENGINES - 1 must be present in the
531 device info table."
532 RUNLIST_IS_VALID : Is this engine a host engine with a runlist?
533 RUNLIST_ENUM : "[T]he Host runlist ID on which methods for the current
534 [engine] should be submitted... The runlist id is used to
535 index into any register array whose __SIZE_1 is equal to
536 NV_HOST_NUM_RUNLISTS. [Engines] corresponding to all
537 RUNLIST_ENUM ids 0 through NV_HOST_NUM_RUNLISTS - 1 must
538 be present in the device info table."
539 INTR_IS_VALID : Does this device have an interrupt?
540 INTR_ENUM : Interrupt ID for use with "the NV_PMC_INTR_*_DEVICE
541 register bitfields."
542 RESET_IS_VALID : Does this engine have a reset ID?
543 RESET_ENUM : Reset ID for use indexing the "NV_PMC_ENABLE_DEVICE(i)
544 and NV_PMC_ELPG_ENABLE_DEVICE(i) register bitfields."
545
546 == INFO_TYPE_ENGINE_TYPE fields ==
547 ENGINE_TYPE : What type of engine is this? (see ENGINE_TYPES_NAMES)
548
549 Support: Kepler, Maxwell, Pascal, Volta, Ampere
550 See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info.
551*/
552#define NV_PTOP_DEVICE_INFO(i) (0x00022700+(i)*4)
553#define NV_PTOP_DEVICE_INFO__SIZE_1 64
554typedef union {
555 // DATA type fields
556 struct {
557 enum DEVICE_INFO_TYPE info_type:2;
558 bool fault_id_is_valid:1;
559 uint32_t fault_id:7;
560 uint32_t padding0:2;
561 uint32_t pri_base:12;
562 uint32_t padding1:2;
563 uint32_t inst_id:4;
564 uint32_t is_not_enum2:1;
565 bool has_next_entry:1;
566 } __attribute__((packed));
567 // ENUM type fields
568 struct {
569 uint32_t padding2:2;
570 bool reset_is_valid:1;
571 bool intr_is_valid:1;
572 bool runlist_is_valid:1;
573 bool engine_is_valid:1;
574 uint32_t padding3:3;
575 uint32_t reset_enum:5;
576 uint32_t padding4:1;
577 uint32_t intr_enum:5;
578 uint32_t padding5:1;
579 uint32_t runlist_enum:4;
580 uint32_t padding6:1;
581 uint32_t engine_enum:4;
582 uint32_t padding7:2;
583 } __attribute__((packed));
584 // ENGINE_TYPE type fields
585 struct {
586 uint32_t padding8:2;
587 enum ENGINE_TYPES engine_type:29;
588 uint32_t padding9:1;
589 } __attribute__((packed));
590 uint32_t raw;
591} ptop_device_info_t;
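
/* Illustrative usage sketch (not part of this commit; assumes the
   nvdebug_readl() helper defined later in this header):

	ptop_device_info_t entry;
	int i, runlists = 0;
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) {
		entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO(i));
		if (entry.info_type == INFO_TYPE_ENUM && entry.runlist_is_valid)
			runlists++;
	}

   Note that the scan covers all slots, per the sparsity warning above. */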
592
593#define NV_PTOP_SCAL_NUM_GPCS 0x00022430
594#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
595#define NV_PTOP_SCAL_NUM_CES 0x00022444
596// PCE_MAP is Volta+ only
597#define NV_CE_PCE_MAP 0x00104028
598
599// GPC and TPC masks
600// Support: Maxwell+
601#define NV_FUSE_GPC 0x00021c1c
602#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)
603
604/* Location of the 1Kb instance block with page tables for BAR1 and BAR2.
605 Support: Fermi+ (?), Pascal
606*/
607#define NV_PBUS_BAR1_BLOCK 0x00001704
608#define NV_PBUS_BAR2_BLOCK 0x00001714
609typedef union {
610 struct {
611 uint32_t ptr:28;
612 enum INST_TARGET target:2;
613 uint32_t padding0:1;
614 bool is_virtual:1;
615 } __attribute__((packed));
616 uint32_t raw;
617 struct {
618 uint32_t map:30;
619 uint32_t padding1:2;
620 } __attribute__((packed));
621} bar_config_block_t;
622
623/* BAR0 PRAMIN (Private RAM Instance) window configuration
624
625 BASE : Base of window >> 16 in [TARGET] virtual address space
626 TARGET : Which address space BASE points into
627
628 Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes
629
630 Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
631*/
632#define NV_PBUS_BAR0_WINDOW 0x00001700
633#define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window)
634#define NV_PRAMIN_LEN 0x00100000
635typedef union {
636 struct {
637 uint32_t base:24;
638 enum INST_TARGET target:2;
639 uint32_t padding0:6;
640 } __attribute__((packed));
641 uint32_t raw;
642} bar0_window_t;
643
644// Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
645#define NV_PRAMIN_PDB_CONFIG_OFF 0x200
646typedef union {
647 struct {
648 uint32_t target:2;
649 uint32_t vol:1;
650 uint32_t padding0:1;
651 uint32_t fault_replay_tex:1;
652 uint32_t fault_replay_gcc:1;
653 uint32_t padding1:4;
654 bool is_ver2:1;
655		bool is_64k_big_page:1; // 128KB otherwise
656 uint32_t page_dir_lo:20;
657 uint32_t page_dir_hi:32;
658 } __attribute__((packed));
659 uint64_t raw;
660} page_dir_config_t;
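
// Illustrative sketch (not part of this commit): the page directory base
// encoded above is 4k-aligned, so the full address is reassembled as:
static inline uint64_t pdb_base(page_dir_config_t pdb) {
	return ((uint64_t)pdb.page_dir_hi << 32)
	       | ((uint64_t)pdb.page_dir_lo << 12);
}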
661
662/* Page directory entry
663
664 Note: Format changed with Pascal (how?)
665
666 Support: Pascal, Volta, Turing, Ampere
667*/
668// FIXME: PDE/PTEs are actually 64 bits =S
669// Important: Aperture keys are different with PDEs
670enum PD_TARGET {
671 PD_AND_TARGET_INVALID = 0, // b000
672 PD_AND_TARGET_VID_MEM = 2, // b010
673 PD_AND_TARGET_SYS_MEM_COHERENT = 4, // b100
674 PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6, // b110
675 PTE_AND_TARGET_VID_MEM = 1, // b001
676 PTE_AND_TARGET_PEER = 3, // b011
677 PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101
678 PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111
679};
680static inline char* pd_target_to_text(enum PD_TARGET t) {
681 switch (t) {
682 case PD_AND_TARGET_INVALID:
683 return "INVALID";
684 case PD_AND_TARGET_VID_MEM:
685 case PTE_AND_TARGET_VID_MEM:
686 return "VID_MEM";
687 case PTE_AND_TARGET_PEER:
688 return "PEER";
689 case PD_AND_TARGET_SYS_MEM_COHERENT:
690 case PTE_AND_TARGET_SYS_MEM_COHERENT:
691 return "SYS_MEM_COHERENT";
692 case PD_AND_TARGET_SYS_MEM_NONCOHERENT:
693 case PTE_AND_TARGET_SYS_MEM_NONCOHERENT:
694 return "SYS_MEM_NONCOHERENT";
695 default:
696 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
697		return "INVALID";
698 }
699}
700
701// PDE/PTE V2 type
702// Note: As the meaning of target (bits 2:1) changes depending on if the entry
703// is a PTE or not, this combines them into a single target field to
704// simplify comparisons.
705// Support: Pascal, Turing, Ampere
706typedef union {
707 // Page Directory Entry (PDE)
708 struct {
709 bool is_pte:1;
710 uint32_t __target:2;
711 bool is_volatile:1;
712 uint32_t padding1:4;
713 uint32_t addr:24;
714 } __attribute__((packed));
715 // Page Table Entry (PTE)
716 struct {
717 enum PD_TARGET target:3;
718 uint32_t __is_volatile:1;
719 bool is_encrypted:1;
720 bool is_privileged:1;
721 bool is_readonly:1;
722 bool atomics_disabled:1;
723 uint32_t __addr:24;
724 } __attribute__((packed));
725 uint32_t raw;
726} page_dir_entry_t;
727
728// PDE/PTE V1 types
729// Support: Fermi, Kepler, Maxwell
730enum V1_PD_TARGET {
731 PD_TARGET_INVALID = 0,
732 PD_TARGET_VID_MEM = 1,
733 PD_TARGET_SYS_MEM_COHERENT = 2,
734 PD_TARGET_SYS_MEM_NONCOHERENT = 3,
735};
736// Page Directory Entry (PDE)
737typedef union {
738// Large page fields
739 struct {
740// 0:32
741 enum V1_PD_TARGET target:2;
742 uint32_t padding0:2;
743 uint64_t addr:28; // May be wider?
744// 32:63
745 uint32_t padding2:3;
746 uint32_t is_volatile:1; // Might have counted wrong?
747 uint32_t padding3:28;
748 } __attribute__((packed));
749// Small page fields
750 struct {
751// 0:32
752 uint32_t padding00:32;
753// 32:63
754 enum V1_PD_TARGET alt_target:2;
755 uint32_t alt_is_volatile:1; // Might have counted wrong?
756 uint32_t padding03:1;
757 uint64_t alt_addr:28;
758 } __attribute__((packed));
759 uint64_t raw;
760} page_dir_entry_v1_t;
761// Page Table Entry (PTE)
762// Reconstructed from info in Jetson nvgpu driver
763typedef union {
764 struct {
765// 0:32
766 bool is_present:1;
767 bool is_privileged:1;
768 bool is_readonly:1;
769 uint32_t padding0:1;
770 uint64_t addr:28;
771// 32:63
772 bool is_volatile:1;
773		enum INST_TARGET target:2;
774 uint32_t padding1:1;
775 uint32_t kind:8;
776 uint32_t comptag:17;
777 uint32_t padding2:1;
778 bool is_read_disabled:1;
779 bool is_write_disabled:1;
780 } __attribute__((packed));
781 uint64_t raw;
782} page_tbl_entry_v1_t;
783//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3};
784//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3};
785//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024};
786/* PDE V0 (nv50/Tesla)
787typedef union {
788 struct {
789		enum V0_PDE_TYPE type:2;
790		enum INST_TARGET target:2;
791		uint32_t padding0:1;
792		enum V0_PDE_SIZE sublevel_size:2;
793 uint32_t padding1:5;
794 uint32_t addr:28;
795 uint32_t padding2:24;
796 } __attribute__((packed));
797 uint64_t raw;
798} page_dir_entry_v0_t;*/
799/* PTE V0 (nv50)
800typedef union {
801 struct {
802 bool is_present:1;
803 uint32_t padding3:2;
804 bool is_readonly:1;
805 enum INST_TARGET target:2;
806 bool is_privileged:1;
807 uint32_t contig_blk_sz:3;
808 uint32_t padding4:2;
809 uint32_t addr:28;
810 uint32_t storage_type:7; // ???
811 uint32_t compression_mode:2; // ???
812 uint32_t compression_tag:12; // ???
813 bool is_long_partition_cycle:1; // ???
814 bool is_encrypted:1;
815 uint32_t padding5:1;
816 } __attribute__((packed));
817 uint64_t raw;
818} page_tbl_entry_v0_t;*/
819
304// TODO(jbakita): Maybe put the above GPU types in a different file. 820// TODO(jbakita): Maybe put the above GPU types in a different file.
305 821
306#define for_chan_in_tsg(chan, tsg) \ 822#define NV_PCI_VENDOR 0x10de
307 for (chan = (struct runlist_chan*)(tsg + 1); \ 823struct nvdebug_state {
308 (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \ 824 // Pointer to the mapped base address of the GPU control registers (obtained
309 chan++) 825 // via ioremap() originally). For embedded GPUs, we extract this from their
826 // struct nvgpu_os_linux. For discrete GPUs, we create our own mapping of
827 // BAR0 with pci_iomap(). Access via nvgpu_readl/writel functions.
828 void __iomem *regs;
829 // Depending on the architecture, BAR2 or BAR3 are used to access PRAMIN
830 union {
831 void __iomem *bar2;
832 void __iomem *bar3;
833 };
834 int chip_id;
835 // Additional state from the built-in driver. Only set iff
836 // chip_id == NV_CHIP_ID_GV11B
837 struct gk20a *g;
838 // Pointer to PCI device needed for pci_iounmap
839 struct pci_dev *pcid;
840};
841
842/*const struct runlist_funcs {
843 u8 size;
844 enum ENTRY_TYPE (*entry_type)(struct nvdebug_state *, void *);
845 uint32_t (*chid)(struct nvdebug_state *, void *);
846 uint32_t (*inst_ptr_lo)(struct nvdebug_state *, void *);
847 enum INST_TARGET (*inst_target)(struct nvdebug_state *, void *):
848 uint32_t (*tsgid)(struct nvdebug_state *, void *);
849 uint32_t (*timeslice_scale)(struct nvdebug_state *, void *);
850 uint32_t (*timeslice_timeout)(struct nvdebug_state *, void *);
851 uint32_t (*tsg_length)(struct nvdebug_state *, void *);
852};*/
853
854// This disgusting macro is a crutch to work around the fact that runlists were
855// different prior to Volta.
856#define VERSIONED_RL_ACCESSOR(_ENTRY_TYPE, type, prop) \
857 __attribute__((unused)) \
858 static type (prop)(const struct nvdebug_state *g, const void *raw) { \
859		if (g->chip_id >= NV_CHIP_ID_VOLTA) { \
860 const struct gv100_runlist_ ## _ENTRY_TYPE *entry = (struct gv100_runlist_ ## _ENTRY_TYPE*)raw; \
861 return entry->prop; \
862		} else if (g->chip_id >= NV_CHIP_ID_KEPLER) { \
863 const struct gk110_runlist_ ## _ENTRY_TYPE *entry = (struct gk110_runlist_ ## _ENTRY_TYPE*)raw; \
864 return entry->prop; \
865 } else { \
866 printk(KERN_WARNING "[nvdebug] " #prop " unavailable on GPU ID %x, which is older than Kepler.\n", g->chip_id); \
867 return (type)0; \
868 } \
869 }
870
871VERSIONED_RL_ACCESSOR(chan, uint32_t, chid);
872VERSIONED_RL_ACCESSOR(chan, uint32_t, inst_ptr_lo);
873VERSIONED_RL_ACCESSOR(chan, enum INST_TARGET, inst_target);
874VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsgid);
875VERSIONED_RL_ACCESSOR(tsg, enum ENTRY_TYPE, entry_type);
876VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_scale);
877VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_timeout);
878VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
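
// Usage sketch: given a `void *entry` read out of a runlist on device `g`,
// `tsgid(g, entry)` or `chid(g, entry)` decode the field via whichever entry
// layout matches the detected chip, warning on pre-Kepler GPUs.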
310 879
311#define next_tsg(tsg) \ 880
312 (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length 881#define NV_RL_ENTRY_SIZE(g) \
882 ((g)->chip_id >= NV_CHIP_ID_VOLTA ? sizeof(struct gv100_runlist_tsg) : sizeof(struct gk110_runlist_tsg))
883
884#define for_chan_in_tsg(g, chan, tsg) \
885 for (chan = (typeof(chan))(((u8*)tsg) + NV_RL_ENTRY_SIZE(g)); \
886 (u8*)chan < ((u8*)tsg) + (1 + tsg_length(g, tsg)) * NV_RL_ENTRY_SIZE(g); \
887 chan = (typeof(chan))(((u8*)chan) + NV_RL_ENTRY_SIZE(g)))
888
889#define next_tsg(g, tsg) \
890 (typeof(tsg))((u8*)(tsg) + NV_RL_ENTRY_SIZE(g) * (tsg_length(g, tsg) + 1))
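
/* Illustrative usage sketch (not part of this commit): walk every channel of
   a TSG entry, then advance to the following TSG, independent of the
   per-architecture entry layout:

	void *chan;
	for_chan_in_tsg(g, chan, tsg)
		printk(KERN_INFO "[nvdebug] chid %d\n", chid(g, chan));
	tsg = next_tsg(g, tsg);
*/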
313 891
314struct runlist_iter { 892struct runlist_iter {
315 struct entry_tsg *curr_tsg; 893 // Pointer to either a TSG or channel entry (they're the same size)
894 void *curr_entry;
895 // This should be set to tsg_length when a TSG is reached, and
896 // decremented as each subsequent channel is printed. This allows us to
897 // track which channel are and are not part of the TSG.
898 int channels_left_in_tsg;
899 // Total runlist length, etc
316 runlist_info_t rl_info; 900 runlist_info_t rl_info;
317}; 901};
318 902
903#define NVDEBUG_MAX_DEVICES 8
904extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
905
319// Defined in runlist.c 906// Defined in runlist.c
320struct gk20a* get_live_gk20a(void); 907int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
321int get_runlist_iter(struct runlist_iter *rl_iter); 908int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
322int preempt_tsg(uint32_t tsg_id); 909
910// Defined in mmu.c
911uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
912void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
913uint64_t search_page_directory(
914 struct nvdebug_state *g,
915 void __iomem *pde_offset,
916 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
917 uint64_t addr_to_find);
918uint64_t search_v1_page_directory(
919 struct nvdebug_state *g,
920 void __iomem *pde_offset,
921 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
922 uint64_t addr_to_find);
923
323 924
324static inline struct gk20a *get_gk20a(struct device *dev) { 925static inline struct gk20a *get_gk20a(struct device *dev) {
325 // XXX: Only works because gk20a* is the first member of gk20a_platform 926 // XXX: Only works because gk20a* is the first member of gk20a_platform
326 return *((struct gk20a**)dev_get_drvdata(dev)); 927 return *((struct gk20a**)dev_get_drvdata(dev));
327} 928}
328 929
329// Functionally identical to nvgpu_readl() 930// We use the data field of the proc_dir_entry ("PDE" in this function) to store
931// our index into the g_nvdebug_state array
932static inline int seq2gpuidx(struct seq_file *s) {
933 const struct file *f = s->file;
934 return (uintptr_t)PDE_DATA(file_inode(f));
935}
936static inline int file2gpuidx(const struct file *f) {
937 return (uintptr_t)PDE_DATA(file_inode(f));
938}
939static inline int file2parentgpuidx(const struct file *f) {
940 // Should be safe to call on ProcFS entries, as our parent should (?)
941 // still exist if we're called. If not, there are worse races in this
942 // module.
943 return (uintptr_t)PDE_DATA(file_dentry(f)->d_parent->d_inode);
944}
945
946#define gk20a_regs(gk20a) (container_of(gk20a, struct nvgpu_os_linux, g)->regs)
947
948// Similar to nvgpu_readl()
330// (except we don't try to resolve situations where regs is NULL) 949// (except we don't try to resolve situations where regs is NULL)
331static inline u32 nvdebug_readl(struct gk20a* g, u32 r) { 950static inline u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
332 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 951 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
333 if (unlikely(!g_os->regs)) { 952 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
334 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); 953 return -1;
335 return -1; 954 }
336 } 955 return readl(s->regs + r);
337 return readl(g_os->regs + r);
338} 956}
339 957
340// quadword version of nvdebug_readl() 958// quadword version of nvdebug_readl()
341static inline u64 nvdebug_readq(struct gk20a* g, u32 r) { 959static inline u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
342 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 960 u64 ret;
343 u64 ret; 961 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
344 if (unlikely(!g_os->regs)) { 962 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
345 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); 963 return -1;
346 return -1; 964 }
347 }
348 // readq seems to always return the uppermost 32 bits as 0, so workaround with readl 965 // readq seems to always return the uppermost 32 bits as 0, so workaround with readl
349 ret = readl(g_os->regs + r); 966 ret = readl(s->regs + r);
350 ret |= ((u64)readl(g_os->regs + r + 4)) << 32; 967 ret |= ((u64)readl(s->regs + r + 4)) << 32;
351 return ret; 968 return ret;
352} 969}
353 970
354// Functionally identical to nvgpu_writel() 971// Similar to nvgpu_writel()
355static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) { 972static inline void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
356 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 973 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
357 if (unlikely(!g_os->regs)) { 974 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
975 return;
976 }
977 writel_relaxed(v, s->regs + r);
978 wmb();
979}
980
981// quadword version of nvdebug_writel()
982// XXX: Untested; this probably doesn't work
983static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
984 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
358 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n"); 985 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
359 return; 986 return;
360 } 987 }
361 writel_relaxed(v, g_os->regs + r); 988 writeq_relaxed(v, s->regs + r);
362 wmb(); 989 wmb();
363} 990}
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 0854b8b..695b5fd 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -2,64 +2,282 @@
2 * SPDX-License-Identifier: MIT 2 * SPDX-License-Identifier: MIT
3 */ 3 */
4 4
5/* TODO
6 * - Add sysfs trigger for a preemption
7 */
8
9#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type 5#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
6#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
10#include <linux/kernel.h> 7#include <linux/kernel.h>
11#include <linux/module.h> 8#include <linux/module.h>
12#include <linux/proc_fs.h> // So we can set up entries in /proc 9#include <linux/pci.h> // For PCI device scanning
10#include <linux/proc_fs.h> // So we can set up entries in /proc
13 11
14#include "nvdebug.h" 12#include "nvdebug.h"
13#include "stubs.h"
15 14
16// LIAR. But without this we can't use GPL-only exported symbols like 15// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
17// platform_bus_type or bus_find_device_by_name... 16// platform_bus_type or bus_find_device_by_name...
18MODULE_LICENSE("GPL"); 17MODULE_LICENSE("Dual MIT/GPL");
19MODULE_AUTHOR("Joshua Bakita"); 18MODULE_AUTHOR("Joshua Bakita");
20MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs"); 19MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
21MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now
22 20
23extern const struct file_operations runlist_file_ops; 21extern const struct file_operations runlist_file_ops;
24extern const struct file_operations preempt_tsg_file_ops; 22extern const struct file_operations preempt_tsg_file_ops;
25extern const struct file_operations disable_channel_file_ops; 23extern const struct file_operations disable_channel_file_ops;
26extern const struct file_operations enable_channel_file_ops; 24extern const struct file_operations enable_channel_file_ops;
27extern const struct file_operations switch_to_tsg_file_ops; 25extern const struct file_operations switch_to_tsg_file_ops;
26extern const struct file_operations device_info_file_ops;
27extern const struct file_operations nvdebug_read_reg32_file_ops;
28
29// Bus types are global symbols in the kernel
30extern struct bus_type platform_bus_type;
31struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
32unsigned int g_nvdebug_devices = 0;
33
34// TEMP
35irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
36 printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
37 return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
38}
39
40// Find any and all NVIDIA GPUs in the system
41// Note: This function fails if any of them are in a bad state
42int probe_and_cache_device(void) {
43 // platform bus (SoC) iterators
44 struct device *dev = NULL;
45 struct device *temp_dev;
46 // PCI search iterator and search query
47 struct pci_dev *pcid = NULL;
48	// This query pattern is modeled on nouveau's
49 struct pci_device_id query = {
50 .vendor = NV_PCI_VENDOR, // Match NVIDIA devices
51 .device = PCI_ANY_ID,
52 .subvendor = PCI_ANY_ID,
53 .subdevice = PCI_ANY_ID,
54 .class_mask = 0xff << 16,
55 .class = PCI_BASE_CLASS_DISPLAY << 16, // Match display devs
56 };
57 int i = 0;
58 // Search the platform bus for the first device that matches our name
59 // Search for GV10B (Jetson Xavier)
60 while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
61 dev = temp_dev;
62 // Search for GP10B (Jetson TX2)
63 while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
64 dev = temp_dev;
65 // TODO: Support other platform bus devices (gk20a, gm20b)
66 if (dev) {
67 struct nvgpu_os_linux *l;
68 mc_boot_0_t ids;
69 g_nvdebug_state[i].g = get_gk20a(dev);
70 l = container_of(g_nvdebug_state[i].g, struct nvgpu_os_linux, g);
71 g_nvdebug_state[i].regs = l->regs;
72 if (!g_nvdebug_state[i].regs)
73 return -EADDRNOTAVAIL;
74 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
75 if (ids.raw == -1)
76 return -EADDRNOTAVAIL;
77 g_nvdebug_state[i].chip_id = ids.chip_id;
78 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
79 ids.chip_id, ARCH2NAME(ids.architecture));
80 i++;
81 }
82 // Search the PCI bus and iterate through all matches
83 // FIXME: State rollback
84 while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
85 mc_boot_0_t ids;
86 g_nvdebug_state[i].g = NULL;
87 // Map BAR0 (GPU control registers)
88 g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
89 if (!g_nvdebug_state[i].regs) {
90 pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
91 return -EADDRNOTAVAIL;
92 }
93 // Map BAR3 (CPU-accessible mappings of GPU DRAM)
94 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
95 // Try mapping only the lower half of BAR3 on fail
96 // (vesafb may map the top half for display)
97 if (!g_nvdebug_state[i].bar3)
98 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
99 g_nvdebug_state[i].pcid = pcid;
100 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
101 if (ids.raw == -1) {
102 pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
103 return -EADDRNOTAVAIL;
104 }
105 g_nvdebug_state[i].chip_id = ids.chip_id;
106 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
107 ids.chip_id, ARCH2NAME(ids.architecture));
108 // TEMP
109 if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
110 printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
111 }
112 i++;
113 }
114 // Return the number of devices we found
115 if (i > 0)
116 return i;
117 return -ENODEV;
118}
119
120// Create files `/proc/gpu#/runlist#`, world readable
121int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
122 ptop_device_info_t info;
123 struct proc_dir_entry *rl_entry;
124 int i, rl_id;
125 char runlist_name[12];
126 int max_rl_id = 0; // Always at least one runlist
127 // Figure out how many runlists there are by checking the device info
128 // registers. Runlists are always numbered sequentially, so we just have
129 // to find the highest-valued one and add 1 to get the number of runlists.
130 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) {
131 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO(i));
132 if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
133 continue;
134 if (info.runlist_enum > max_rl_id)
135 max_rl_id = info.runlist_enum;
136 }
137 // Create files to read each runlist. The read handling code looks at the
138 // PDE_DATA associated with the file to determine what the runlist ID is.
139 for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
140 snprintf(runlist_name, 12, "runlist%d", rl_id);
141 rl_entry = proc_create_data(
142 runlist_name, 0444, dir, &runlist_file_ops,
143 (void*)(uintptr_t)rl_id);
144 if (!rl_entry)
145 return -ENOMEM;
146 }
147 return 0;
148}
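
For reference, the read side is assumed to recover the stashed runlist ID the same way nvdebug_reg32_read() recovers its register offset, via the entry's *data field; a sketch (the real handler lives in runlist_procfs.c, and the helper name here is hypothetical):

	static inline int file2rlid(struct file *f) {
		// proc_create_data() above stored the runlist ID in the entry's *data
		return (int)(uintptr_t)PDE_DATA(file_inode(f));
	}
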
149
150// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
151// TODO: Don't run this on unsupported GPUs
152int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
153 char file_name[20];
154 int i;
155 struct proc_dir_entry *gpc_tpc_mask_entry;
156 // Get a bitmask of which GPCs are disabled
157 uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC);
158	// Get the maximum number of GPCs for this chip
159 uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
160 // For each enabled GPC, expose a mask of disabled TPCs
161 for (i = 0; i < max_gpcs; i++) {
162 // Do nothing if GPC is disabled
163 if ((1 << i) & gpcs_mask)
164 continue;
165 // If GPC is enabled, create an entry to read disabled TPCs mask
166 snprintf(file_name, 20, "gpc%d_tpc_mask", i);
167 gpc_tpc_mask_entry = proc_create_data(
168 file_name, 0444, dir, &nvdebug_read_reg32_file_ops,
169 (void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i));
170 if (!gpc_tpc_mask_entry)
171 return -ENOMEM;
172 }
173 return 0;
174}
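
A small derived example of these registers in use, assuming (per the comments above) that the fuse registers expose disabled-unit masks: counting the enabled TPCs on one GPC. The helper name is hypothetical:

	static uint32_t num_enabled_tpcs(struct nvdebug_state *g, int gpc_id) {
		uint32_t max_tpcs = nvdebug_readl(g, NV_PTOP_SCAL_NUM_TPC_PER_GPC);
		uint32_t disabled = nvdebug_readl(g, NV_FUSE_TPC_FOR_GPC(gpc_id));
		// Subtract the fused-off TPCs from the per-GPC maximum
		return max_tpcs - hweight32(disabled & ((1u << max_tpcs) - 1));
	}
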
28 175
29int __init nvdebug_init(void) { 176int __init nvdebug_init(void) {
30 struct proc_dir_entry *rl_entry, *preempt_entry, *disable_channel_entry, 177 struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
31 *enable_channel_entry, *switch_to_tsg_entry; 178 *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
32 // Create file `/proc/preempt_tsg`, world readable 179 *num_gpcs_entry;
33 rl_entry = proc_create("runlist", 0444, NULL, &runlist_file_ops); 180 int rl_create_err, tpc_masks_create_err;
34 // Create file `/proc/preempt_tsg`, world writable 181 // Check that an NVIDIA GPU is present and initialize g_nvdebug_state
35 preempt_entry = proc_create("preempt_tsg", 0222, NULL, &preempt_tsg_file_ops); 182 int res = probe_and_cache_device();
36 // Create file `/proc/disable_channel`, world writable 183 if (res < 0)
37 disable_channel_entry = proc_create("disable_channel", 0222, NULL, &disable_channel_file_ops); 184 return res;
38 // Create file `/proc/enable_channel`, world writable 185 g_nvdebug_devices = res;
39	enable_channel_entry = proc_create("enable_channel", 0222, NULL, &enable_channel_file_ops); 186	// Create separate ProcFS directories for each GPU
40 // Create file `/proc/switch_to_tsg`, world writable 187 while (res--) {
41 switch_to_tsg_entry = proc_create("switch_to_tsg", 0222, NULL, &switch_to_tsg_file_ops); 188 char device_id_str[7];
42 // ProcFS entry creation only fails if out of memory 189 uintptr_t device_id = res; // This is uintptr as we abuse the *data field on proc_dir_entry to store the GPU id
43 if (!rl_entry || !preempt_entry || !disable_channel_entry || !enable_channel_entry || !switch_to_tsg_entry) { 190 // Create directory /proc/gpu# where # is the GPU number
44	remove_proc_entry("runlist", NULL); 191	snprintf(device_id_str, 7, "gpu%lu", device_id);
45 remove_proc_entry("preempt_tsg", NULL); 192 if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
46 remove_proc_entry("disable_channel", NULL); 193 goto out_nomem;
47 remove_proc_entry("enable_channel", NULL); 194 // Create files `/proc/gpu#/runlist#`, world readable
48 remove_proc_entry("switch_to_tsg", NULL); 195 rl_create_err = create_runlist_files(device_id, dir);
49 printk(KERN_ERR "[nvdebug] Unable to initialize procfs entries!\n"); 196 // Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
50 return -ENOMEM; 197 tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
198 // Create file `/proc/gpu#/preempt_tsg`, world writable
199 preempt_entry = proc_create_data(
200 "preempt_tsg", 0222, dir, &preempt_tsg_file_ops,
201 (void*)device_id);
202 // Create file `/proc/gpu#/disable_channel`, world writable
203 disable_channel_entry = proc_create_data(
204 "disable_channel", 0222, dir, &disable_channel_file_ops,
205 (void*)device_id);
206 // Create file `/proc/gpu#/enable_channel`, world writable
207 enable_channel_entry = proc_create_data(
208 "enable_channel", 0222, dir, &enable_channel_file_ops,
209 (void*)device_id);
210 // Create file `/proc/gpu#/switch_to_tsg`, world writable
211 switch_to_tsg_entry = proc_create_data(
212 "switch_to_tsg", 0222, dir, &switch_to_tsg_file_ops,
213 (void*)device_id);
214 // Create file `/proc/gpu#/device_info`, world readable
215 device_info_entry = proc_create_data(
216 "device_info", 0444, dir, &device_info_file_ops,
217 (void*)device_id);
218 // Create file `/proc/gpu#/num_gpcs`, world readable
219 num_gpcs_entry = proc_create_data(
220 "num_gpcs", 0444, dir, &nvdebug_read_reg32_file_ops,
221 (void*)NV_PTOP_SCAL_NUM_GPCS);
222 // Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
223 num_gpcs_entry = proc_create_data(
224 "num_tpc_per_gpc", 0444, dir, &nvdebug_read_reg32_file_ops,
225 (void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
226 // Create file `/proc/gpu#/num_ces`, world readable
227 num_gpcs_entry = proc_create_data(
228 "num_ces", 0444, dir, &nvdebug_read_reg32_file_ops,
229 (void*)NV_PTOP_SCAL_NUM_CES);
230	// Create file `/proc/gpu#/gpc_mask`, world readable
231 num_gpcs_entry = proc_create_data(
232 "gpc_mask", 0444, dir, &nvdebug_read_reg32_file_ops,
233 (void*)NV_FUSE_GPC);
234 // In both nouveau and nvgpu, the PCE_MAP register is only available on Volta+
235 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
236 // TODO: Redo to num_pces
237 // Create file `/proc/gpu#/pce_map`, world readable
238 num_gpcs_entry = proc_create_data(
239 "pce_map", 0444, dir, &nvdebug_read_reg32_file_ops,
240 (void*)NV_CE_PCE_MAP);
241 }
242 // ProcFS entry creation only fails if out of memory
243 if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
244 !disable_channel_entry || !enable_channel_entry ||
245 !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
246 goto out_nomem;
51 } 247 }
248 // (See Makefile if you want to know the origin of GIT_HASH.)
52 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n"); 249 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
53 return 0; 250 return 0;
251out_nomem:
252 // Make sure to clear all ProcFS directories on error
253 while (res < g_nvdebug_devices) {
254 char device_id_str[7];
255 snprintf(device_id_str, 7, "gpu%d", res);
256 remove_proc_subtree(device_id_str, NULL);
257 res++;
258 }
259 return -ENOMEM;
54} 260}
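
The per-GPU directory's *data field set by proc_mkdir_data() above is what file2gpuidx()/file2parentgpuidx() (defined in nvdebug.h, not included in this dump) are assumed to read back, roughly:

	static inline uintptr_t file2gpuidx(const struct file *f) {
		return (uintptr_t)PDE_DATA(file_inode(f));             // entry's own *data
	}
	static inline uintptr_t file2parentgpuidx(const struct file *f) {
		return (uintptr_t)proc_get_parent_data(file_inode(f)); // /proc/gpu# *data
	}
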
55 261
56static void __exit nvdebug_exit(void) { 262static void __exit nvdebug_exit(void) {
57 remove_proc_entry("runlist", NULL); 263 struct nvdebug_state *g;
58 remove_proc_entry("preempt_tsg", NULL); 264 // Deinitialize each device
59 remove_proc_entry("disable_channel", NULL); 265 while (g_nvdebug_devices--) {
60 remove_proc_entry("enable_channel", NULL); 266 // Remove procfs directory
61 remove_proc_entry("switch_to_tsg", NULL); 267 char device_id[7];
62 printk(KERN_INFO "[nvdebug] Exiting...\n"); 268 snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
269 remove_proc_subtree(device_id, NULL);
270 // Free BAR mappings
271 g = &g_nvdebug_state[g_nvdebug_devices];
272		if (g->pcid && g->regs)
273			pci_iounmap(g->pcid, g->regs);
274		if (g->pcid && g->bar3)
275			pci_iounmap(g->pcid, g->bar3);
276		// TEMP: The IRQ tap was only requested for PCI devices
277		if (g->pcid) free_irq(g->pcid->irq, g->pcid);
278		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.\n", g->chip_id);
279 }
280 printk(KERN_INFO "[nvdebug] Module exit complete.\n");
63} 281}
64 282
65module_init(nvdebug_init); 283module_init(nvdebug_init);
diff --git a/runlist.c b/runlist.c
index c8ff99f..94be18e 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,122 +1,127 @@
1#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
2//#include <linux/iommu.h> // For struct iommu_domain
3#include <linux/kernel.h> // Kernel types 1#include <linux/kernel.h> // Kernel types
4#include <asm/io.h>
5 2
6#include "nvdebug.h" 3#include "nvdebug.h"
7 4
8// Bus types are global symbols in the kernel
9extern struct bus_type platform_bus_type;
10
11struct gk20a* get_live_gk20a(void) {
12 struct device *dev = NULL;
13 struct device *temp_dev;
14 struct gk20a *g;
15 struct nvgpu_os_linux *l;
16 // Get the last device that matches our name
17 while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
18 dev = temp_dev;
19 printk(KERN_INFO "[nvdebug] Found a matching device %s\n", dev_name(dev));
20 }
21 if (!dev)
22 return NULL;
23 g = get_gk20a(dev);
24 // The address pointed to `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
25 // - A GPU address (type is sysmem_coherent)
26 // - A physical address (dereferencing after ioremap crashes)
27 // - A kernel virtual address (dereferencing segfaults)
28 // So maybe it's some sort of custom thing? This is an address that the GPU
29 // can use, so it would make most sense for it to be a physical address.
30 //
31 // BUT, it can't possibly be a physical address, as it would refer to an
32 // address greater than the maximum one on our system (by a lot!).
33 // Maybe I'm reading the runlist base wrong?
34 // Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
35 // address! So, what's this I/O address space? All I know is that it's what
36 // nvgpu_mem_get_addr() returns. That function returns the result of either:
37 // - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
38 // converts an IPA to a PA?
39 // - nvgpu_mem_iommu_translate
40 //
41 // The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
42 // returns SYSMEM.
43 //
44 // To convert a physical address to a IOMMU address, we add a bit
45 //
46 // BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
47 // before because the GPU had simply gone to sleep and invalidated its
48 // register state, so nvgpu_readl() was simply returning garbage.
49 l = container_of(g, struct nvgpu_os_linux, g);
50 if (!l->regs)
51 return NULL;
52 return g;
53}
54
55/* Get runlist head and info (incl. length) 5/* Get runlist head and info (incl. length)
56 @param rl_iter Location at which to store output 6 @param rl_iter Location at which to store output
 7 @param rl_id ID of the runlist to read
57*/ 8*/
58int get_runlist_iter(struct runlist_iter *rl_iter) { 9int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter) {
59 struct entry_tsg head; 10 runlist_base_t rl_base;
60 runlist_base_t rl_base; 11 runlist_info_t rl_info;
61 runlist_info_t rl_info; 12 u64 runlist_iova;
62 u64 runlist_iova; 13 *rl_iter = (struct runlist_iter){0};
63 struct gk20a *g = get_live_gk20a(); 14 rl_base.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST_BASE(rl_id));
64 if (!g) 15 // Check that reads are working
16 if (rl_base.raw == -1)
65 return -EIO; 17 return -EIO;
66 rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE); 18 // The address pointed to `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
67 rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST); 19 // - A GPU address (type is sysmem_coherent)
68 runlist_iova = ((u64)rl_base.ptr) << 12; 20 // - A physical address (dereferencing after ioremap crashes)
69 printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n", 21 // - A kernel virtual address (dereferencing segfaults)
70 rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova); 22 // So maybe it's some sort of custom thing? This is an address that the GPU
71 // TODO: Support reading video memory 23 // can use, so it would make most sense for it to be a physical address.
72 if (rl_base.type == TARGET_VID_MEM) { 24 //
73 printk(KERN_ERR "[nvdebug] Runlist is located in video memory. Access to video memory is unimplemented."); 25 // BUT, it can't possibly be a physical address, as it would refer to an
74 return -ENOTSUPP; 26 // address greater than the maximum one on our system (by a lot!).
27 // Maybe I'm reading the runlist base wrong?
28 // Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
29 // address! So, what's this I/O address space? All I know is that it's what
30 // nvgpu_mem_get_addr() returns. That function returns the result of either:
31 // - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
32 // converts an IPA to a PA?
33 // - nvgpu_mem_iommu_translate
34 //
35 // The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
36 // returns SYSMEM.
37 //
 38 // To convert a physical address to an IOMMU address, we add a bit
39 //
40 // BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
41 // before because the GPU had simply gone to sleep and invalidated its
42 // register state, so nvgpu_readl() was simply returning garbage.
43 rl_info.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST(rl_id));
44 runlist_iova = ((u64)rl_base.ptr) << 12;
45 printk(KERN_INFO "[nvdebug] Runlist %d @ %llx in %s (config raw: %x)\n",
46 rl_id, runlist_iova, target_to_text(rl_base.target), rl_base.raw);
47 printk(KERN_INFO "[nvdebug] Runlist length %d, ID %d\n", rl_info.len, rl_info.id);
48 // Return early on an empty runlist
49 if (!rl_info.len)
50 return 0;
51 // If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping
52 if (rl_base.target == TARGET_VID_MEM) {
53 printk(KERN_WARNING "[nvdebug] Runlist is located in video memory. Access to video memory is experimental.");
54 bar_config_block_t bar1_block, bar2_block;
55 bar1_block.raw = nvdebug_readl(g, NV_PBUS_BAR1_BLOCK);
56 printk(KERN_INFO "[nvdebug] BAR1 inst block @ %llx in %s's %s address space.\n", ((u64)bar1_block.ptr) << 12, target_to_text(bar1_block.target), bar1_block.is_virtual ? "virtual" : "physical");
57 bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK);
 58	printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", ((u64)bar2_block.ptr) << 12, target_to_text(bar2_block.target), bar2_block.is_virtual ? "virtual" : "physical");
59 uint32_t bar_inst_pramin_offset = vram2PRAMIN(g, (uint64_t)bar2_block.ptr << 12);
60 if (!bar_inst_pramin_offset) {
61 printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
62 return -EOPNOTSUPP;
63 }
64 /* TODO: Support BAR1?
65 bar_inst_pramin_offset = vram2PRAMIN(g, bar1_block.ptr << 12);
66 if (!bar_inst_pramin_offset) {
67 printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR1 in the current NV_PRAMIN window. VRAM inaccessible.\n");
68 return -EOPNOTSUPP;
69 }*/
70 // Instance blocks (size == 1kb) contain many things, but we only care about
71 // the section which describes the location of the page directory (page table)
72 uint32_t bar_pdb_config_pramin_offset = bar_inst_pramin_offset + NV_PRAMIN_PDB_CONFIG_OFF;
73 page_dir_config_t pd_config;
74 pd_config.raw = nvdebug_readq(g, bar_pdb_config_pramin_offset + NV_PRAMIN);
75 uint64_t bar_pdb_vram_addr = pd_config.page_dir_hi;
76 bar_pdb_vram_addr <<= 20;
77 bar_pdb_vram_addr |= pd_config.page_dir_lo;
78 bar_pdb_vram_addr <<= 12;
79 printk(KERN_INFO "[nvdebug] BAR2 PDB @ %llx in %s of version %s (config raw: %llx)\n", bar_pdb_vram_addr, target_to_text(pd_config.target), pd_config.is_ver2 ? "2" : "1", pd_config.raw);
80 // TODO: SYSMEM support for page table location
81 if (pd_config.target != TARGET_VID_MEM) {
82 printk(KERN_WARNING "[nvdebug] BAR2 PDB is in an unsupported location.\n");
83 return -EOPNOTSUPP;
84 }
85 uint32_t bar_pdb_pramin_offset = vram2PRAMIN(g, bar_pdb_vram_addr);
86 if (!bar_pdb_pramin_offset) {
87 printk(KERN_WARNING "[nvdebug] Unable to find page directory BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
88 return -EOPNOTSUPP;
89 }
90 uint64_t runlist_bar_vaddr;
91 if (pd_config.is_ver2)
92 runlist_bar_vaddr = search_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
93 else
94 runlist_bar_vaddr = search_v1_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
95 if (!runlist_bar_vaddr) {
96 printk(KERN_WARNING "[nvdebug] Unable to find runlist mapping in BAR2/3 page tables.\n");
97 return -EOPNOTSUPP;
98 }
99 printk(KERN_INFO "[nvdebug] Runlist @ %llx in BAR2 virtual address space.\n", runlist_bar_vaddr);
100 /* XXX: Old test code
101 uint32_t bar2_pd_pramin_offset = vram_to_pramin_off(bar2_pd);
102 //walk_pd_subtree(bar2_pd_pramin_offset);
103 uint64_t runlist_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, runlist_iova);
104 page_dir_entry_t pde_0;
105 pde_0.raw = nvdebug_readl(g, NV_PRAMIN + bar2_pd_pramin_offset);
106 uint32_t pde_1 = nvdebug_readl(g, NV_PRAMIN + vram_to_pramin_off(((u64)pde_0.addr) << 12));
107 uint64_t pde_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, ((u64)pde_0.addr) << 12);
108 uint32_t pde_2 = readl(g->bar3 + pde_bar2_vaddr);
109 printk(KERN_INFO "[nvdebug] PDE0 via PRAMIN: %x, via BAR3: %x\n", pde_1, pde_2);
110 */
111 if (!g->bar3) {
112 printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped.\n");
113 return -ENODEV;
114 }
 115		rl_iter->curr_entry = g->bar3 + runlist_bar_vaddr;
116 } else {
117 // Directly access the runlist if stored in SYS_MEM (physically addressed)
118 rl_iter->curr_entry = phys_to_virt(runlist_iova);
75 } 119 }
76 // Segfaults 120 rl_iter->rl_info = rl_info;
77 //u32 attempted_read = ioread32(runlist_iova); 121 return 0;
78 //printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);
79
80 // Errors out
81 //u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
82 //printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);
83
84 /* Overcomplicated?
85 struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
86 if (!domain) {
87 printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
88 return -EIO;
89 }
90 u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
91 printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
92 */
93
94 printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
95 printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
96 head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
97
98 rl_iter->curr_tsg = (struct entry_tsg*)phys_to_virt(runlist_iova);
99 rl_iter->rl_info = rl_info;
100 return 0;
101 //printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
102 //printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
103 //printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
104 //printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
105 //printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
106
107 //printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
108 //printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
109 //printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
110 //read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
111 /*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
112 printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
113 printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
114 printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
115 printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid); */
116} 122}
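
Putting the iterator contract together: rl_info.len counts TSG entries plus all channel entries, each entry is NV_RL_ENTRY_SIZE(g) bytes, and each TSG entry is immediately followed by its channels (per the layout assumed in runlist_procfs.c). A sketch of a complete walk over curr_entry, using the accessors from nvdebug.h:

	void walk_runlist(struct nvdebug_state *g, struct runlist_iter *it) {
		void *entry = it->curr_entry;
		int pos;
		for (pos = 0; pos < it->rl_info.len; pos++, entry += NV_RL_ENTRY_SIZE(g)) {
			if (entry_type(g, entry) == ENTRY_TYPE_TSG)
				printk(KERN_INFO "[nvdebug] TSG %d (%d channels)\n",
				       tsgid(g, entry), tsg_length(g, entry));
			else
				printk(KERN_INFO "[nvdebug] Channel %d\n", chid(g, entry));
		}
	}
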
117 123
118int preempt_tsg(uint32_t tsg_id) { 124int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id) {
119 struct gk20a *g = get_live_gk20a();
120 runlist_info_t rl_info; 125 runlist_info_t rl_info;
121 pfifo_preempt_t pfifo_preempt; 126 pfifo_preempt_t pfifo_preempt;
122 runlist_disable_t rl_disable; 127 runlist_disable_t rl_disable;
diff --git a/runlist_procfs.c b/runlist_procfs.c
index 411f844..a6b0d94 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -6,7 +6,14 @@
6#define RUNLIST_PROCFS_NAME "runlist" 6#define RUNLIST_PROCFS_NAME "runlist"
7#define DETAILED_CHANNEL_INFO 7#define DETAILED_CHANNEL_INFO
8 8
9static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uint32_t chid) { 9/* Print channel details using PCCSR (Programmable Channel Control System RAM?)
10 * @param s Pointer to state from seq_file subsystem to pass to seq_printf
11 * @param g Pointer to our internal GPU state
12 * @param chid ID of channel to print details on, range [0, 512)
13 * @param prefix Text string to prefix each line with, or empty string
14 */
15#ifdef DETAILED_CHANNEL_INFO
16static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) {
10 channel_ctrl_t chan; 17 channel_ctrl_t chan;
11 char *loc_txt; 18 char *loc_txt;
12 u64 instance_ptr; 19 u64 instance_ptr;
@@ -16,23 +23,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uin
16 return -EIO; 23 return -EIO;
17 instance_ptr = chan.inst_ptr; 24 instance_ptr = chan.inst_ptr;
18 instance_ptr <<= 12; 25 instance_ptr <<= 12;
19 seq_printf(s, " +- Channel Info %-4d -+\n", chid); 26 seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid);
20 seq_printf(s, " | Enabled: %d|\n", chan.enable); 27 seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable);
21 seq_printf(s, " | Next: %d|\n", chan.next); 28 seq_printf(s, "%s| Next: %d|\n", prefix, chan.next);
22 seq_printf(s, " | Force CTX Reload: %d|\n", chan.force_ctx_reload); 29 seq_printf(s, "%s| Force CTX Reload: %d|\n", prefix, chan.force_ctx_reload);
23 seq_printf(s, " | Enable set: %d|\n", chan.enable_set); 30 seq_printf(s, "%s| Enable set: %d|\n", prefix, chan.enable_set);
24 seq_printf(s, " | Enable clear: %d|\n", chan.enable_clear); 31 seq_printf(s, "%s| Enable clear: %d|\n", prefix, chan.enable_clear);
25 seq_printf(s, " | PBDMA Faulted: %d|\n", chan.pbdma_faulted); 32 seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted);
26 seq_printf(s, " | ENG Faulted: %d|\n", chan.eng_faulted); 33 seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted);
27 seq_printf(s, " | Status: %2d|\n", chan.status); 34 seq_printf(s, "%s| Status: %2d|\n", prefix, chan.status);
28 seq_printf(s, " | Busy: %d|\n", chan.busy); 35 seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy);
29 seq_printf(s, " | Instance PTR: |\n"); 36 seq_printf(s, "%s| Instance PTR: |\n", prefix);
30 seq_printf(s, " | %#018llx |\n", instance_ptr); 37 seq_printf(s, "%s| %#018llx |\n", prefix, instance_ptr);
31 seq_printf(s, " | %-20s|\n", loc_txt); 38 seq_printf(s, "%s| %-20s|\n", prefix, loc_txt);
32 seq_printf(s, " | Instance bound: %d|\n", chan.inst_bind); 39 seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
33 seq_printf(s, " +---------------------+\n"); 40 // START TEMP
41 // "runlist_id -1 is synonym for the ENGINE_GR_GK20A runlist id"
42 // GR, GRCE, and ASYNC_CE
43 // Note that this appears to be broken??
44 // Peek into the channel instance RAM
45 if (chan.inst_target == TARGET_SYS_MEM_COHERENT) {
46 seq_printf(s, "%s| Target Engine: %2d|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*43/*NV_RAMFC_TARGET*/) & 0x1f);
47 seq_printf(s, "%s| PDB LO: %#08x|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*128/*NV_RAMIN_PAGE_DIR_BASE_LO*/) & 0xfffff000);
48 seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(*(uint64_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*166/*NV_RAMIN_SC_PDB_VALID*/)));
49 // This appears to be unset on Xavier
50 //seq_printf(s, "%s| PAS ID: %8ld|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*135/*NV_RAMIN_PASID*/) & 0xfffff);
51 }
52 // END TEMP
53 seq_printf(s, "%s+---------------------+\n", prefix);
34 return 0; 54 return 0;
35} 55}
56#endif
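
The magic numbers in the TEMP block above follow one rule: the hardware headers index instance-block fields by 32-bit word, so word N sits at byte offset 4*N from the instance pointer. A sketch of the accessor this implies (hypothetical helper, valid only for SYS_MEM-backed instance blocks, where phys_to_virt() applies):

	static inline uint32_t inst_ram_rd32(uint64_t instance_ptr, unsigned word) {
		// e.g. word 43 == NV_RAMFC_TARGET, word 128 == NV_RAMIN_PAGE_DIR_BASE_LO
		return *(uint32_t*)phys_to_virt(instance_ptr + 4 * word);
	}
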
36 57
37#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0) 58#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
38// Bug workaround. See comment in runlist_file_seq_start() 59// Bug workaround. See comment in runlist_file_seq_start()
@@ -41,10 +62,14 @@ static loff_t pos_fixup;
41 62
42static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) { 63static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
43 static struct runlist_iter rl_iter; 64 static struct runlist_iter rl_iter;
65 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
44 // *pos == 0 for first call after read of file 66 // *pos == 0 for first call after read of file
45 if (*pos == 0) { 67 if (*pos == 0) {
46 int err = get_runlist_iter(&rl_iter); 68 int err = get_runlist_iter(g, seq2gpuidx(s), &rl_iter);
47 if (err) 69 if (err)
70 return ERR_PTR(err);
71 // Don't try to print an empty runlist
72 if (rl_iter.rl_info.len <= 0)
48 return NULL; 73 return NULL;
49 return &rl_iter; 74 return &rl_iter;
50 } 75 }
@@ -68,12 +93,13 @@ static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter,
68 loff_t *pos) { 93 loff_t *pos) {
69 struct runlist_iter* rl_iter = raw_rl_iter; 94 struct runlist_iter* rl_iter = raw_rl_iter;
70 void *ret = NULL; 95 void *ret = NULL;
71 // Advance by one TSG + channels under last TSG 96 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
72 *pos += 1 + rl_iter->curr_tsg->tsg_length; 97 // Advance by one TSG or channel
98 (*pos)++;
99 rl_iter->curr_entry += NV_RL_ENTRY_SIZE(g);
73 // Verify we haven't reached the end of the runlist 100 // Verify we haven't reached the end of the runlist
74 // rl_info.len is the num of tsg entries + total num of channel entries 101 // rl_info.len is the num of tsg entries + total num of channel entries
75 if (*pos < rl_iter->rl_info.len) { 102 if (*pos < rl_iter->rl_info.len) {
76 rl_iter->curr_tsg = next_tsg(rl_iter->curr_tsg);
77 ret = rl_iter; 103 ret = rl_iter;
78 } 104 }
79#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0) 105#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
@@ -88,57 +114,57 @@ static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) {
88} 114}
89 115
90static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) { 116static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
91 struct entry_tsg* tsg = ((struct runlist_iter*)raw_rl_iter)->curr_tsg; 117 struct runlist_iter *rl_iter = raw_rl_iter;
92 struct runlist_chan* chan; 118 void *entry = rl_iter->curr_entry;
93 struct gk20a *g = get_live_gk20a(); 119 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
94 if (!g) 120 if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
95 return -EIO; 121 if (rl_iter->channels_left_in_tsg) {
96 if (tsg->entry_type != ENTRY_TYPE_TSG) { 122 printk(KERN_WARNING "[nvdebug] Found a TSG @ %px when %d channels were still expected under the previous TSG in the runlist!\n", entry, rl_iter->channels_left_in_tsg);
97 printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in tsg print logic!\n"); 123 return -EIO;
98 return -EIO; 124 }
99 } 125 rl_iter->channels_left_in_tsg = tsg_length(g, entry);
100 seq_printf(s, "+---- TSG Entry %-2d----+\n", tsg->tsgid); 126 seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry));
101 seq_printf(s, "| Scale: %-13d|\n", tsg->timeslice_scale); 127 seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry));
102 seq_printf(s, "| Timeout: %-11d|\n", tsg->timeslice_timeout); 128 seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry));
103 seq_printf(s, "+---------------------+\n"); 129 seq_printf(s, "| Length: %-12d|\n", tsg_length(g, entry));
104 for_chan_in_tsg(chan, tsg) { 130 seq_printf(s, "+---------------------+\n");
131 } else {
132 char *indt = "";
105#ifndef DETAILED_CHANNEL_INFO 133#ifndef DETAILED_CHANNEL_INFO
106 char* loc_txt; 134 u64 instance_ptr = 0;
107 u64 instance_ptr;
108#endif 135#endif
109 if (chan->entry_type != ENTRY_TYPE_CHAN) { 136 if (rl_iter->channels_left_in_tsg) {
110 printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in channel print logic!\n"); 137 indt = " ";
111 return -EIO; 138 rl_iter->channels_left_in_tsg--;
112 } 139 }
113#ifdef DETAILED_CHANNEL_INFO 140#ifdef DETAILED_CHANNEL_INFO
114 runlist_detail_seq_show_chan(s, g, chan->chid); 141 runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
115#else 142#else
116 loc_txt = target_to_text(chan->inst_target);
117 if (!loc_txt) {
118 printk(KERN_WARNING "[nvdebug] Invalid apature in channel print logic!\n");
119 return -EIO;
120 }
121 // Reconstruct pointer to channel instance block 143 // Reconstruct pointer to channel instance block
122 instance_ptr = chan->inst_ptr_hi; 144 if (g->chip_id >= NV_CHIP_ID_VOLTA) {
123 instance_ptr <<= 32; 145 instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi;
124 instance_ptr |= chan->inst_ptr_lo << 12; 146 instance_ptr <<= 32;
125 147 }
126 seq_printf(s, " +- Channel Entry %-4d-+\n", chan->chid); 148 instance_ptr |= inst_ptr_lo(g, entry) << 12;
127 seq_printf(s, " | Runqueue Selector: %d|\n", chan->runqueue_selector); 149
128 seq_printf(s, " | Instance PTR: |\n"); 150 seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry));
129 seq_printf(s, " | %#018llx |\n", instance_ptr); 151 if (g->chip_id >= NV_CHIP_ID_VOLTA)
130 seq_printf(s, " | %-20s|\n", loc_txt); 152 seq_printf(s, "%s| Runqueue Selector: %d|\n", indt,
131 seq_printf(s, " +---------------------+\n"); 153 ((struct gv100_runlist_chan*)entry)->runqueue_selector);
154 seq_printf(s, "%s| Instance PTR: |\n", indt);
155 seq_printf(s, "%s| %#018llx |\n", indt, instance_ptr);
156 seq_printf(s, "%s| %-20s|\n", indt, target_to_text(inst_target(g, entry)));
157 seq_printf(s, "%s+---------------------+\n", indt);
132#endif 158#endif
133 } 159 }
134 return 0; 160 return 0;
135} 161}
136 162
137static const struct seq_operations runlist_file_seq_ops = { 163static const struct seq_operations runlist_file_seq_ops = {
138 .start = runlist_file_seq_start, 164 .start = runlist_file_seq_start,
139 .next = runlist_file_seq_next, 165 .next = runlist_file_seq_next,
140 .stop = runlist_file_seq_stop, 166 .stop = runlist_file_seq_stop,
141 .show = runlist_file_seq_show, 167 .show = runlist_file_seq_show,
142}; 168};
143 169
144static int runlist_file_open(struct inode *inode, struct file *f) { 170static int runlist_file_open(struct inode *inode, struct file *f) {
@@ -157,6 +183,7 @@ ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
157 uint32_t target_tsgid; 183 uint32_t target_tsgid;
158 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 184 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
159 int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid); 185 int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
186 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
160 if (err) 187 if (err)
161 return err; 188 return err;
162 189
@@ -165,7 +192,7 @@ ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
165 return -ERANGE; 192 return -ERANGE;
166 193
167 // Execute preemption 194 // Execute preemption
168 err = preempt_tsg(target_tsgid); 195 err = preempt_tsg(g, target_tsgid);
169 if (err) 196 if (err)
170 return err; 197 return err;
171 198
@@ -181,9 +208,9 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
181 uint32_t target_channel; 208 uint32_t target_channel;
182 channel_ctrl_t chan; 209 channel_ctrl_t chan;
183 int err; 210 int err;
184 struct gk20a *g = get_live_gk20a(); 211 runlist_info_t rl_info;
185 if (!g) 212 runlist_disable_t rl_disable;
186 return -EIO; 213 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
187 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 214 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
188 err = kstrtou32_from_user(buffer, count, 0, &target_channel); 215 err = kstrtou32_from_user(buffer, count, 0, &target_channel);
189 if (err) 216 if (err)
@@ -195,7 +222,16 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
195 // Disable channel 222 // Disable channel
196 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel)); 223 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
197 chan.enable_clear = true; 224 chan.enable_clear = true;
225 // disable sched
226 rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
227 rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
228 rl_disable.raw |= BIT(rl_info.id);
229 nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
230 // disable chan
198 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw); 231 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw);
232 // enable sched
233 rl_disable.raw &= ~BIT(rl_info.id);
234 nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
199 235
200 return count; 236 return count;
201} 237}
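
The disable path above open-codes a scheduler gate around the channel write. A reusable sketch of the same read-modify-write on NV_PFIFO_SCHED_DISABLE (assumed: one disable bit per runlist, per the BIT(rl_info.id) usage; helper name is hypothetical):

	static void set_runlist_scheduling(struct nvdebug_state *g, int rl_id, bool run) {
		runlist_disable_t rl_disable;
		rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
		if (run)
			rl_disable.raw &= ~BIT(rl_id); // Clear the disable bit
		else
			rl_disable.raw |= BIT(rl_id);  // Set the disable bit
		nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
	}
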
@@ -209,9 +245,7 @@ ssize_t enable_channel_file_write(struct file *f, const char __user *buffer,
209 uint32_t target_channel; 245 uint32_t target_channel;
210 channel_ctrl_t chan; 246 channel_ctrl_t chan;
211 int err; 247 int err;
212 struct gk20a *g = get_live_gk20a(); 248 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
213 if (!g)
214 return -EIO;
215 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 249 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
216 err = kstrtou32_from_user(buffer, count, 0, &target_channel); 250 err = kstrtou32_from_user(buffer, count, 0, &target_channel);
217 if (err) 251 if (err)
@@ -235,14 +269,12 @@ const struct file_operations enable_channel_file_ops = {
235ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, 269ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
236 size_t count, loff_t *off) { 270 size_t count, loff_t *off) {
237 uint32_t target_tsgid; 271 uint32_t target_tsgid;
238 struct runlist_chan* chan; 272 struct gv100_runlist_chan* chan;
239 channel_ctrl_t chan_ctl; 273 channel_ctrl_t chan_ctl;
240 struct runlist_iter rl_iter; 274 struct runlist_iter rl_iter;
241 int err; 275 int err;
242 loff_t pos = 0; 276 loff_t pos = 0;
243 struct gk20a *g = get_live_gk20a(); 277 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
244 if (!g)
245 return -EIO;
246 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 278 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
247 err = kstrtou32_from_user(buffer, count, 0, &target_tsgid); 279 err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
248 if (err) 280 if (err)
@@ -251,32 +283,34 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
251 if (target_tsgid > MAX_TSGID) 283 if (target_tsgid > MAX_TSGID)
252 return -ERANGE; 284 return -ERANGE;
253 285
254 err = get_runlist_iter(&rl_iter); 286 err = get_runlist_iter(g, 0, &rl_iter);
255 if (err) 287 if (err)
256 return err; 288 return err;
257 289
258 // Iterate through all TSGs 290 // Iterate through all TSGs
259 while (pos < rl_iter.rl_info.len) { 291 while (pos < rl_iter.rl_info.len) {
260 if (rl_iter.curr_tsg->tsgid == target_tsgid) { 292 if (tsgid(g, rl_iter.curr_entry) == target_tsgid) {
261 // Enable channels of target TSG 293 // Enable channels of target TSG
262 for_chan_in_tsg(chan, rl_iter.curr_tsg) { 294 for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
263 chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid)); 295 chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
264 chan_ctl.enable_set = true; 296 chan_ctl.enable_set = true;
265 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw); 297 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
266 } 298 }
267 } else { 299 } else {
300 // XXX: Fix for bare channels. Maybe a "for_chan_until_tsg" macro?
268 // Disable all other channels 301 // Disable all other channels
269 for_chan_in_tsg(chan, rl_iter.curr_tsg) { 302 // (This is how the Jetson nvgpu driver disables TSGs)
303 for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
270 chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid)); 304 chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
271 chan_ctl.enable_clear = true; 305 chan_ctl.enable_clear = true;
272 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw); 306 nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
273 } 307 }
274 } 308 }
275 pos += 1 + rl_iter.curr_tsg->tsg_length; 309 pos += 1 + tsg_length(g, rl_iter.curr_entry);
276 rl_iter.curr_tsg = next_tsg(rl_iter.curr_tsg); 310 rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
277 } 311 }
278 // Switch to next TSG with active channels (should be our TSG) 312 // Switch to next TSG with active channels (should be our TSG)
279 err = preempt_tsg(target_tsgid); 313 err = preempt_tsg(g, target_tsgid);
280 if (err) 314 if (err)
281 return err; 315 return err;
282 316
diff --git a/stubs.h b/stubs.h
new file mode 100644
index 0000000..bfcc0d7
--- /dev/null
+++ b/stubs.h
@@ -0,0 +1,80 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Helpful private functions copied from elsewhere in the kernel tree
4 * DO NOT MODIFY
5 */
6#include <linux/version.h>
7
8// Functions from drivers/pci/pci.h
9/**
10 * pci_match_one_device - Tell if a PCI device structure has a matching
11 * PCI device id structure
12 * @id: single PCI device id structure to match
13 * @dev: the PCI device structure to match against
14 *
15 * Returns the matching pci_device_id structure or %NULL if there is no match.
16 */
17static inline const struct pci_device_id *
18pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
19{
20 if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
21 (id->device == PCI_ANY_ID || id->device == dev->device) &&
22 (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
23 (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
24 !((id->class ^ dev->class) & id->class_mask))
25 return id;
26 return NULL;
27}
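
A worked example of the class test, using the query values from nvdebug_entry.c: an NVIDIA 3D controller reports class 0x030200 (base 0x03, sub-class 0x02, prog-if 0x00). With id->class == 0x030000 and id->class_mask == 0xff0000:

	/* (0x030000 ^ 0x030200) & 0xff0000 == 0x000200 & 0xff0000 == 0,
	 * so VGA (0x0300xx) and 3D (0x0302xx) controllers both match on
	 * the base class alone. */
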
28
29// Functions from drivers/pci/search.c
30#include <linux/device.h>
31#include <linux/pci.h>
32extern struct bus_type pci_bus_type;
33
34#if LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)
35static int match_pci_dev_by_id(struct device *dev, void *data)
36#else
37static int match_pci_dev_by_id(struct device *dev, const void *data)
38#endif
39{
40 struct pci_dev *pdev = to_pci_dev(dev);
41 const struct pci_device_id *id = data;
42
43 if (pci_match_one_device(id, pdev))
44 return 1;
45 return 0;
46}
47
48/*
49 * pci_get_dev_by_id - begin or continue searching for a PCI device by id
50 * @id: pointer to struct pci_device_id to match for the device
51 * @from: Previous PCI device found in search, or %NULL for new search.
52 *
53 * Iterates through the list of known PCI devices. If a PCI device is found
54 * with a matching id a pointer to its device structure is returned, and the
55 * reference count to the device is incremented. Otherwise, %NULL is returned.
56 * A new search is initiated by passing %NULL as the @from argument. Otherwise
57 * if @from is not %NULL, searches continue from next device on the global
58 * list. The reference count for @from is always decremented if it is not
59 * %NULL.
60 *
61 * This is an internal function for use by the other search functions in
62 * this file.
63 */
64static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id,
65 struct pci_dev *from)
66{
67 struct device *dev;
68 struct device *dev_start = NULL;
69 struct pci_dev *pdev = NULL;
70
71 if (from)
72 dev_start = &from->dev;
73 dev = bus_find_device(&pci_bus_type, dev_start, (void *)id,
74 match_pci_dev_by_id);
75 if (dev)
76 pdev = to_pci_dev(dev);
77 pci_dev_put(from);
78 return pdev;
79}
80
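
Usage sketch, mirroring the loop in probe_and_cache_device(): each call drops the reference on `from` and takes one on the returned device, so keeping a device past the loop requires a later pci_dev_put(). The query below restates the one from nvdebug_entry.c:

	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR, .device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16, .class = PCI_BASE_CLASS_DISPLAY << 16,
	};
	struct pci_dev *pcid = NULL;
	// Loop exit leaves pcid == NULL, so no dangling reference remains
	while ((pcid = pci_get_dev_by_id(&query, pcid)))
		pci_info(pcid, "matched display-class device\n");
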