From 306a03d18b305e4e573be3b2931978fa10679eb9 Mon Sep 17 00:00:00 2001
From: Joshua Bakita
Date: Thu, 22 Jun 2023 12:52:59 -0400
Subject: Quick dump of current state for Ben to review.

---
 Makefile             |  13 +-
 device_info_procfs.c | 126 +++++++++
 mmu.c                | 251 ++++++++++++++++++
 nvdebug.h            | 719 +++++++++++++++++++++++++++++++++++++++++++++++----
 nvdebug_entry.c      | 288 ++++++++++++++++++---
 runlist.c            | 221 ++++++++--------
 runlist_procfs.c     | 188 ++++++++------
 stubs.h              |  80 ++++++
 8 files changed, 1614 insertions(+), 272 deletions(-)
 create mode 100644 device_info_procfs.c
 create mode 100644 mmu.c
 create mode 100644 stubs.h

diff --git a/Makefile b/Makefile
index 18c07e8..2dc90c7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,14 @@
 obj-m += nvdebug.o
-nvdebug-objs = runlist_procfs.o runlist.o nvdebug_entry.o
+nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o nvdebug_entry.o
 KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\"
+# Add -mfentry to KBUILD_CFLAGS above if the build fails due to a missing mcount symbol
 # TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
-#ccflags-y += -I$(PWD)/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
-ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi
+ccflags-y += -I$(PWD)/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
+#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

 all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
diff --git a/device_info_procfs.c b/device_info_procfs.c
new file mode 100644
index 0000000..cd6c53c
--- /dev/null
+++ b/device_info_procfs.c
@@ -0,0 +1,126 @@
+#include "nvdebug.h"
+#include <linux/seq_file.h> // For seq_* functions and types
+#include <linux/uaccess.h> // For copy_to_user()
+
+// Generic register printing function, used for PTOP_*_NUM registers (+more)
+// @param f    File being read from. `data` field is register offset to read.
+// @param buf  User buffer for result
+// @param size Length of user buffer
+// @param off  Requested offset. Updated by number of characters written.
+// @return -errno on error, otherwise number of bytes written to *buf
+// Note: Parent `data` field MUST be the GPU index
+static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
+	char out[16];
+	int chars_written;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	if (size < 16 || *off != 0)
+		return 0;
+	// A 32-bit register will always take less than 16 characters to print
+	chars_written = scnprintf(out, 16, "%#0x\n", nvdebug_readl(g, (uintptr_t)PDE_DATA(file_inode(f))));
+	if (copy_to_user(buf, out, chars_written))
+		printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
+	*off += chars_written;
+	return chars_written;
+}
+const struct file_operations nvdebug_read_reg32_file_ops = {
+	.read = nvdebug_reg32_read,
+};
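+
+// Usage sketch: nvdebug_entry.c (later in this patch) binds this handler to a
+// register by stashing the register offset in the ProcFS entry's data field,
+// which nvdebug_reg32_read() recovers via PDE_DATA(file_inode(f)):
+//
+//   proc_create_data("num_gpcs", 0444, dir, &nvdebug_read_reg32_file_ops,
+//                    (void*)NV_PTOP_SCAL_NUM_GPCS);
+//
+// Reading the file (e.g. `cat /proc/gpu0/num_gpcs`) then prints that register.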
+
+//// ==v== PTOP_DEVICE_INFO ==v== ////
+
+// Called to start or resume a sequence. Prior to 4.19, *pos is unreliable, so
+// the iterator state is kept in the static variable `idx` instead.
+// Initializes iterator `idx` state and returns it. Ends sequence on NULL.
+static void* device_info_file_seq_start(struct seq_file *s, loff_t *pos) {
+	static int idx;
+	// If start of sequence, reset `idx`
+	if (*pos == 0)
+		idx = 0;
+	// Number of possible info entries is fixed, and list is sparse
+	if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1)
+		return NULL;
+	return &idx;
+}
+
+// Steps to next record. Returns new value of `idx`.
+// Calls show() on non-NULL return
+static void* device_info_file_seq_next(struct seq_file *s, void *idx,
+                                       loff_t *pos) {
+	(*pos)++; // Required by seq interface
+	// Number of possible info entries is fixed, and list is sparse
+	// (pre-increment, so that show() is never called past the last entry)
+	if (++(*(int*)idx) >= NV_PTOP_DEVICE_INFO__SIZE_1)
+		return NULL;
+	return idx;
+}
+
+// Print info at index *idx. Returns non-zero on error.
+static int device_info_file_seq_show(struct seq_file *s, void *idx) {
+	ptop_device_info_t curr_info;
+	struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
+
+	curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO(*(int*)idx));
+	// Check for read errors
+	if (curr_info.raw == -1)
+		return -EIO;
+
+	// Parse and print the data
+	switch(curr_info.info_type) {
+	case INFO_TYPE_DATA:
+		// As of early 2022, only the ENUM2 format of this entry exists
+		if (curr_info.is_not_enum2)
+			break;
+		seq_printf(s, "| BAR0 Base %#.8x\n"
+			      "| instance %d\n",
+			   curr_info.pri_base << 12, curr_info.inst_id);
+		if (curr_info.fault_id_is_valid)
+			seq_printf(s, "| Fault ID: %3d\n", curr_info.fault_id);
+		break;
+	case INFO_TYPE_ENUM:
+		if (curr_info.engine_is_valid)
+			seq_printf(s, "| Host's Engine ID: %2d\n", curr_info.engine_enum);
+		if (curr_info.runlist_is_valid)
+			seq_printf(s, "| Runlist ID: %2d\n", curr_info.runlist_enum);
+		if (curr_info.intr_is_valid)
+			seq_printf(s, "| Interrupt ID: %2d\n", curr_info.intr_enum);
+		if (curr_info.reset_is_valid)
+			seq_printf(s, "| Reset ID: %2d\n", curr_info.reset_enum);
+		break;
+	case INFO_TYPE_ENGINE_TYPE:
+		seq_printf(s, "| Engine Type: %2d (", curr_info.engine_type);
+		if (curr_info.engine_type < ENGINE_TYPES_LEN)
+			seq_printf(s, "%s)\n", ENGINE_TYPES_NAMES[curr_info.engine_type]);
+		else
+			seq_printf(s, "Unknown Engine, introduced post-Ampere)\n");
+		break;
+	case INFO_TYPE_NOT_VALID:
+	default:
+		// Device info records are sparse, so skip unset or unknown ones
+		return 0;
+	}
+
+	// Draw a line between each device entry
+	if (!curr_info.has_next_entry)
+		seq_printf(s, "+---------------------+\n");
+	return 0;
+}
+
+static void device_info_file_seq_stop(struct seq_file *s, void *idx) {
+	// No cleanup needed
+}
+
+static const struct seq_operations device_info_file_seq_ops = {
+	.start = device_info_file_seq_start,
+	.next = device_info_file_seq_next,
+	.stop = device_info_file_seq_stop,
+	.show = device_info_file_seq_show,
+};
+
+static int device_info_file_open(struct inode *inode, struct file *f) {
+	return seq_open(f, &device_info_file_seq_ops);
+}
+
+const struct file_operations device_info_file_ops = {
+	.open = device_info_file_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
diff --git a/mmu.c b/mmu.c
new file mode 100644
index 0000000..26c7af5
--- /dev/null
+++ b/mmu.c
@@ -0,0 +1,251 @@
+// Helpers to deal with NVIDIA's MMU and associated page tables
+#include <linux/types.h> // Kernel types
+
+#include "nvdebug.h"
+
+/* One of the oldest ways to access video memory on NVIDIA GPUs is by using
+   a configurable 1MB window into VRAM which is mapped into BAR0 (register)
+   space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
+   and appears to be used today to bootstrap page table configuration.
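+
+   For example, here is a sketch (using the register definitions from
+   nvdebug.h; `g`, `pa`, and `val` are hypothetical locals) of reading the
+   32-bit word at VRAM physical address `pa` through the window:
+
+     bar0_window_t win;
+     win.raw = 0;
+     win.base = pa >> 16; // Window base is in 64KiB units
+     win.target = TARGET_VID_MEM;
+     nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, win.raw);
+     val = nvdebug_readl(g, NV_PRAMIN + (pa & 0xffff));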
+
+   Why is it mapped at a location called NVIDIA Private RAM Instance? Because
+   this used to point to the entirety of instance RAM, which was separate from
+   VRAM on older NVIDIA GPUs.
+*/
+
+/* Convert a physical VRAM address to an offset in the PRAMIN window
+   @param addr VRAM address to convert
+   @return 0 on error, PRAMIN offset on success
+
+   Note: Use phy2PRAMIN() instead if you want a dereferenceable address
+*/
+uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) {
+	uint64_t pramin_base_va;
+	bar0_window_t window;
+	window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
+	// Check if the address is valid (49 bits are addressable on-GPU)
+	if (addr & ~0x0001ffffffffffff) {
+		printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n",
+		       addr, __func__);
+		return 0;
+	}
+	// For unclear (debugging?) reasons, PRAMIN can point to SYSMEM
+	if (window.target != TARGET_VID_MEM)
+		return 0;
+	pramin_base_va = ((uint64_t)window.base) << 16;
+	// Protect against out-of-bounds accesses
+	if (addr < pramin_base_va || addr >= pramin_base_va + NV_PRAMIN_LEN)
+		return 0;
+	return addr - pramin_base_va;
+}
+
+/* NVIDIA GMMU (GPU Memory Management Unit) uses page tables that are mostly
+   straightforward starting with Pascal ("page table version 2"), except for a
+   few quirks (like 16-byte PDE0 entries, but all other entries are 8 bytes).
+
+   All you really need to know is that any given Page Directory Entry (PDE)
+   contains a pointer to the start of a 4k page densely filled with PDEs or Page
+   Table Entries (PTEs).
+
+   == Page Table Refresher ==
+   Page tables convert virtual addresses to physical addresses, and they do this
+   via a tree structure. Leaves (PTEs) contain a physical address, and the path
+   from root to leaf is defined by the virtual address. Non-leaf nodes are PDEs.
+   When descending, the virtual address is sliced into pieces, and one slice is
+   used at each level (as an index) to select the next-visited node (in level+1).
+
+   V2 of NVIDIA's page table format uses 4 levels of PDEs and a final level of
+   PTEs. How the virtual address is sliced to yield an index into each level and
+   a page offset is shown by Fig 1.
+
+   == Figure 1 ==
+   Page Offset (12 bits)                 <------------------------------------+
+   Page Table Entry (PTE) (9 bits)       <--------------------+               |
+   Page Directory Entry (PDE) 0 (8 bits) <-----------+        |               |
+   PDE1 (9 bits) <--------------------+              |        |               |
+   PDE2 (9 bits) <-----------+        |              |        |               |
+   PDE3 (2 bits) <--+        |        |              |        |               |
+                    ^        ^        ^              ^        ^               ^
+   Virtual addr: [48, 47] [46, 38] [37, 29]      [28, 21] [20, 12]       [11, 0]
+
+   The following arrays merely represent different projections of Fig. 1, and
+   only one is strictly needed to reconstruct all the others. However, due to
+   the complexity of page tables, we include all of these to aid in readability.
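+
+   As a worked sketch of Fig. 1: since every entry count below is a power of
+   two, the index into level `l` for a virtual address `va` is simply
+
+     idx = (va >> NV_MMU_PT_V2_LSB[l]) & (NV_MMU_PT_V2_SZ[l] - 1);
+
+   e.g. l == 0 (PDE3) extracts bits [48, 47], selecting one of 4 entries.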
+*/
+// How many nodes/entries per level in V2 of NVIDIA's page table format
+static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
+// Size in bytes of an entry at a particular level
+static const int NV_MMU_PT_V2_ENTRY_SZ[5] = {8, 8, 8, 16, 8};
+// Which bit index is the least significant in indexing each page level
+static const int NV_MMU_PT_V2_LSB[5] = {47, 38, 29, 21, 12};
+
+// Convert a GPU physical address to CPU virtual address via the PRAMIN window
+// Note: Falls back to the PRAMIN base if vram2PRAMIN() fails (returns 0)
+void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
+	return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
+}
+
+/* FIXME
+void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
+	return g->bar2 + off;
+}
+*/
+
+uint64_t search_page_directory_subtree(struct nvdebug_state *g,
+				       void __iomem *pde_offset,
+				       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+				       uint64_t addr_to_find,
+				       uint32_t level) {
+	uint64_t res, i;
+	void __iomem *next;
+	page_dir_entry_t entry;
+	if (level >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
+		return 0;
+	// Hack to work around PDE0 being double-size and strangely formatted
+	if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
+		pde_offset += 8;
+	entry.raw = readl(pde_offset);
+	// If we reached an invalid (unpopulated) PDE, walk back up the tree
+	if (entry.target == PD_AND_TARGET_INVALID)
+		return 0;
+	// Succeed when we reach a PTE with the address we want
+	if (entry.is_pte) {
+		printk(KERN_INFO "[nvdebug] PTE for phy addr %llx (raw: %x)\n", ((u64)entry.addr) << 12, entry.raw);
+		return (uint64_t)entry.addr << 12 == addr_to_find;
+	}
+	printk(KERN_INFO "[nvdebug] Found PDE pointing to %llx in ap '%d' at lvl %d (raw: %x)\n", ((u64)entry.addr) << 12, entry.target, level, entry.raw);
+	// The last level contains only PTEs; a PDE cannot point past it
+	if (level + 1 >= ARRAY_SIZE(NV_MMU_PT_V2_SZ))
+		return 0;
+	// Depth-first search of the page table (the child table at level + 1)
+	for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
+		next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i);
+		// off2addr can fail
+		if (!next) {
+			printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
+			return 0;
+		}
+		res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
+		if (res)
+			return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
+	}
+	return 0;
+}

+/* Search a page directory of the GPU MMU
+   @param pde_offset   Dereferenceable pointer to the start of the PDE3 entries
+   @param off2addr     Func to convert VRAM phys addresses to valid CPU VAs
+   @param addr_to_find Physical address to reconstruct the virtual address of
+   @return 0 on error, otherwise the virtual address at which addr_to_find is
+           mapped into by this page table.
+*/
+uint64_t search_page_directory(struct nvdebug_state *g,
+			       void __iomem *pde_offset,
+			       void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+			       uint64_t addr_to_find) {
+	uint64_t res, i;
+	// Make sure that the query is page-aligned
+	if (addr_to_find & 0xfff) {
+		printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
+		return 0;
+	}
+	// Search the top-level page directory (PDE3)
+	for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
+		if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0)))
+			return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
+	return 0;
+}
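+
+// Usage sketch (hypothetical locals): given the VRAM address `pdb` of a PDE3
+// table (e.g. as located via the BAR2 instance block in runlist.c), recover
+// the GPU virtual address at which physical address `runlist_iova` is mapped:
+//
+//   uint64_t rl_va = search_page_directory(g, phy2PRAMIN(g, pdb),
+//                                          phy2PRAMIN, runlist_iova);
+//   if (!rl_va)
+//       printk(KERN_WARNING "[nvdebug] Runlist not mapped in BAR2/3 tables\n");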
+
+/* GMMU Page Tables Version 1
+   This page table only contains 2 levels and is used in the Fermi, Kepler, and
+   Maxwell architectures
+*/
+// Number of entries in the PDE and PTE levels
+static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 1<<13 is an educated guess!!!
+// Which bit index is the least significant in indexing each page level
+static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
+uint64_t search_v1_page_directory(struct nvdebug_state *g,
+				  void __iomem *pde_offset,
+				  void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+				  uint64_t addr_to_find) {
+	uint64_t j, i = 0;
+	page_dir_entry_v1_t pde;
+	page_tbl_entry_v1_t pte;
+	void __iomem *pte_offset;
+	// For each PDE
+	do {
+		// readq doesn't seem to work on BAR0
+		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4);
+		pde.raw <<= 32;
+		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t));
+		// Verify PDE is present
+		if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
+			continue;
+		// Convert to a dereferenceable pointer from CPU virtual address space
+		pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
+		if (!pte_offset)
+			continue;
+//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
+//		printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw);
+		// For each PTE
+		for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
+			// Don't overrun the PRAMIN window
+			if (pte_offset > NV_PRAMIN + g->regs + NV_PRAMIN_LEN)
+				return 0;
+			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4);
+			pte.raw <<= 32;
+			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t));
+			// Skip non-present PTEs
+			if (!pte.is_present)
+				continue;
+//			printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
+			// If we find a matching PTE, return its virtual address
+			if ((uint64_t)pte.addr << 12 == addr_to_find)
+				return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
+		}
+	} while (++i < NV_MMU_PT_V1_SZ[0]);
+	return 0;
+}
+
+/* GMMU Page Tables Version 0
+   This page table only contains 2 levels and is used in the Tesla architecture
+*/
+/* *** UNTESTED ***
+#define NV_MMU_PT_V0_SZ 2048
+#define NV_MMU_PT_V0_LSB 29
+uint64_t search_v0_page_directory(struct nvdebug_state *g,
+				  void __iomem *pde_offset,
+				  void __iomem *(*off2addr)(struct nvdebug_state*, uint32_t),
+				  uint32_t addr_to_find) {
+	int j, i = 0;
+	page_dir_entry_v0_t pde;
+	page_tbl_entry_v0_t pte;
+	void __iomem *pte_offset;
+	// For each PDE
+	do {
+		// readq doesn't seem to work on BAR0
+		pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v0_t) + 4);
+		pde.raw <<= 32;
+		pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v0_t));
+		//if (pde.raw)
+		//	printk(KERN_INFO "[nvdebug] Read raw PDE @ %x: %llx\n", pde_offset + i * sizeof(page_dir_entry_v1_t), pde.raw);
+		// Skip unpopulated PDEs
+		if (pde.type == NOT_PRESENT)
+			continue;
+		//printk(KERN_INFO "[nvdebug] PDE to %llx present\n", ((uint64_t)pde.addr) << 12);
+		pte_offset = off2addr(g, ((uint64_t)pde.addr) << 12);
+		// For each PTE
+		for (j = 0; j < V0_PDE_SIZE2NUM[pde.sublevel_size]; j++) {
+			pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v0_t) + 4);
+			pte.raw <<= 32;
+			pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v0_t));
+			// Skip non-present PTEs
+			if (!pte.is_present)
+				continue;
+			// If we find a matching PTE, return its virtual address
+			//if (pte.addr != 0x5555555)
+			//	printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s\n", ((uint64_t)pte.addr) << 12, pte.is_present ? "present" : "non-present");
+			if (pte.addr << 12 == addr_to_find)
+				return i << NV_MMU_PT_V0_LSB | j << 12;
+		}
+	} while (++i < NV_MMU_PT_V0_SZ);
+	return 0; // No match
+}
+*/
diff --git a/nvdebug.h b/nvdebug.h
index 9ac71da..1882756 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -5,14 +5,18 @@
 // TODO(jbakita): Don't depend on these.
 #include <nvgpu/gk20a.h> // For struct gk20a
 #include <os/linux/os_linux.h> // For struct nvgpu_os_linux
+#include <linux/proc_fs.h> // For PDE_DATA() macro
 
 /* Runlist Channel
   A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
   of GPU commands. These commands are typically queued from userspace.
 
-  `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
-  virtual address space for this context. All channels in a TSG point to the
-  same GPU Instance Block (?).
+  Prior to Volta, channels could also exist independent of a TSG. These are
+  called "bare channels" in the Jetson nvgpu driver.
+
+  `INST_PTR` points to a GPU Instance Block which contains FIFO states, virtual
+  address space configuration for this context, and a pointer to the page
+  tables. All channels in a TSG point to the same GPU Instance Block (?).
 
   "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
   thereby which PBDMA will run the channel. Increasing values select
@@ -30,7 +34,13 @@
   ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
   CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
   RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
-                          more than one PBDMA is supported by the runlist
+                          more than one PBDMA is supported by the runlist,
+                          additionally, "A value of 0 targets the first FE
+                          pipe, which can process all FE driven engines:
+                          Graphics, Compute, Inline2Memory, and TwoD.
A value + of 1 targets the second FE pipe, which can only + process Compute work. Note that GRCE work is allowed + on either runqueue.)" INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer INST_PTR_HI : upper 32 bit of instance block pointer @@ -39,6 +49,9 @@ USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer USERD_PTR_HI : upper 32 bits of USERD pointer USERD_TARGET (TGU) : aperture of the USERD data structure + + Channels were around since at least Fermi, but were rearranged with Volta to + add a USERD pointer, a longer INST pointer, and a runqueue selector flag. */ enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; @@ -52,11 +65,12 @@ static inline char* target_to_text(enum INST_TARGET t) { return "SYS_MEM_NONCOHERENT"; default: printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); - return NULL; + return "INVALID"; } } -struct runlist_chan { +// Support: Volta, Ampere, Turing +struct gv100_runlist_chan { // 0:63 enum ENTRY_TYPE entry_type:1; uint32_t runqueue_selector:1; @@ -71,6 +85,20 @@ struct runlist_chan { uint32_t inst_ptr_hi:32; } __attribute__((packed)); +// Support: Fermi, Kepler*, Maxwell, Pascal +// *In Kepler, inst fields may be unpopulated? +struct gm107_runlist_chan { + uint32_t chid:12; + uint32_t padding0:1; + enum ENTRY_TYPE entry_type:1; + uint32_t padding1:18; + uint32_t inst_ptr_lo:20; + enum INST_TARGET inst_target:2; // Totally guessing on this + uint32_t padding2:10; +} __attribute__((packed)); + +#define gk110_runlist_chan gm107_runlist_chan + /* Runlist TSG (TimeSlice Group) The runlist is composed of timeslice groups (TSG). Each TSG corresponds to a single virtual address space on the GPU and contains `TSG_LENGTH` @@ -85,8 +113,15 @@ struct runlist_chan { TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice TSG_LENGTH : number of channels that are part of this timeslice group TSGID : identifier of the Timeslice group (overlays ENTRY_ID) + + TSGs appear to have been introduced with Kepler and stayed the same until + they were rearranged at the time of channel rearrangement to support longer + GPU instance addresses with Volta. */ -struct entry_tsg { + +// Support: Volta, Ampere*, Turing* +// *These treat the top 8 bits of TSGID as GFID (unused) +struct gv100_runlist_tsg { // 0:63 enum ENTRY_TYPE entry_type:1; uint64_t padding:15; @@ -101,14 +136,28 @@ struct entry_tsg { } __attribute__((packed)); #define MAX_TSGID (1 << 12) +// Support: Kepler (v2?), Maxwell, Pascal +// Same fields as Volta except tsg_length is 6 bits rather than 8 +// Last 32 bits appear to contain an undocumented inst ptr +struct gk110_runlist_tsg { + uint32_t tsgid:12; + uint32_t padding0:1; + enum ENTRY_TYPE entry_type:1; + uint32_t timeslice_scale:4; + uint32_t timeslice_timeout:8; + uint32_t tsg_length:6; + uint32_t padding1:32; +} __attribute__((packed)); + + enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1}; /* Preempt a TSG or Channel by ID - ID/CHID : Id of TSG or channel to preempt - IS_PENDING : ???? - TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG + ID/CHID : Id of TSG or channel to preempt + IS_PENDING : Is a context switch pending? 
+  TYPE       : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
 
-  Support: Kepler, Maxwell, Pascal, Volta
+  Support: Kepler, Maxwell, Pascal, Volta, Turing
 */
 #define NV_PFIFO_PREEMPT 0x00002634
 typedef union {
@@ -195,26 +244,36 @@ typedef union {
 */
 // Note: This is different with Turing
-// Support: Kepler, Maxwell, Pascal, Volta
+// Support: Fermi, Kepler, Maxwell, Pascal, Volta
 #define NV_PFIFO_RUNLIST_BASE 0x00002270
+#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
 typedef union {
 	struct {
 		uint32_t ptr:28;
-		uint32_t type:2;
+		enum INST_TARGET target:2;
 		uint32_t padding:2;
 	} __attribute__((packed));
 	uint32_t raw;
 } runlist_base_t;
 
 // Support: Kepler, Maxwell, Pascal, Volta
+// Works on Fermi, but id is one bit longer and is b11111
 #define NV_PFIFO_RUNLIST 0x00002274
+#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
 typedef union {
+	// RUNLIST fields
 	struct {
 		uint32_t len:16;
 		uint32_t padding:4;
-		uint32_t id:4;
+		uint32_t id:4; // Runlist ID (each engine may have a separate runlist)
 		uint32_t padding2:8;
 	} __attribute__((packed));
+	// ENG_RUNLIST fields that differ
+	struct {
+		uint32_t padding3:20;
+		bool is_pending:1; // Is runlist not yet committed?
+		uint32_t padding4:11;
+	} __attribute__((packed));
 	uint32_t raw;
 } runlist_info_t;
 
@@ -301,63 +360,631 @@ typedef union {
 	uint32_t raw;
 } runlist_disable_t;
 
+/* Read GPU descriptors from the Master Controller (MC)
+
+   MINOR_REVISION : Legacy (only used with Kelvin in Nouveau)
+   MAJOR_REVISION : Legacy (only used with Kelvin in Nouveau)
+   IMPLEMENTATION : Which implementation of the GPU architecture
+   ARCHITECTURE   : Which GPU architecture
+
+   CHIP_ID = (ARCHITECTURE << 4) | IMPLEMENTATION
+   CHIP_ID : Unique ID of all chips since Kelvin
+
+   Support: Kelvin, Rankine, Curie, Tesla, Fermi, Kepler, Maxwell, Pascal,
+            Volta, Turing, Ampere
+*/
+#define NV_MC_BOOT_0 0x00000000
+#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
+#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
+#define NV_CHIP_ID_KEPLER 0x0E0
+#define NV_CHIP_ID_VOLTA 0x140
+
+inline static const char* ARCH2NAME(uint32_t arch) {
+	switch (arch) {
+	case 0x01:
+		return "Celsius";
+	case 0x02:
+		return "Kelvin";
+	case 0x03:
+		return "Rankine";
+	case 0x04:
+	case 0x06: // 0x06 is (nForce 6XX integrated only)
+		return "Curie";
+	// 0x07 is unused/skipped
+	case 0x05: // First Tesla card was released before the nForce 6XX
+	case 0x08:
+	case 0x09:
+	case 0x0A:
+		return "Tesla";
+	// 0x0B is unused/skipped
+	case 0x0C:
+	case 0x0D:
+		return "Fermi";
+	case 0x0E:
+	case 0x0F:
+	case 0x11:
+		return "Kepler";
+	case 0x12:
+		return "Maxwell";
+	case 0x13:
+		return "Pascal";
+	case 0x14:
+	case 0x15: // Volta integrated
+		return "Volta";
+	case 0x16:
+		return "Turing";
+	case 0x17:
+		return "Ampere";
+	case 0x18:
+	case 0x19:
+		return "Hopper (?)
or Lovelace (?)"; + default: + if (arch < 0x19) + return "[unknown historical architecture]"; + else + return "[future]"; + } +} + +typedef union { + // Fields as defined in the NVIDIA reference + struct { + uint32_t minor_revision:4; + uint32_t major_revision:4; + uint32_t reserved:4; + uint32_t padding0:8; + uint32_t implementation:4; + uint32_t architecture:5; + uint32_t padding1:3; + } __attribute__((packed)); + uint32_t raw; + // Arch << 4 + impl is also often used + struct { + uint32_t padding2:20; + uint32_t chip_id:9; + uint32_t padding3:3; + } __attribute__((packed)); +} mc_boot_0_t; + +enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3}; +enum ENGINE_TYPES { + ENGINE_GRAPHICS = 0, // GRAPHICS [/compute] + ENGINE_COPY0 = 1, // [raw/physical] COPY #0 + ENGINE_COPY1 = 2, // [raw/physical] COPY #1 + ENGINE_COPY2 = 3, // [raw/physical] COPY #2 + + ENGINE_MSPDEC = 8, // Picture DECoder + ENGINE_MSPPP = 9, // [Video] Post Processing + ENGINE_MSVLD = 10, // [Video] Variable Length Decoder + ENGINE_MSENC = 11, // [Video] ENCoding + ENGINE_VIC = 12, // Video Image Compositor + ENGINE_SEC = 13, // SEquenCer [?] + ENGINE_NVENC0 = 14, // Nvidia Video ENCoder #0 + ENGINE_NVENC1 = 15, // Nvidia Video ENCoder #1 + ENGINE_NVDEC = 16, // Nvidia Video DECoder + + ENGINE_IOCTRL = 18, // I/O ConTRoLler [of NVLINK at least] + ENGINE_LCE = 19, // Logical Copy Engine + ENGINE_GSP = 20, // Gpu System Processor + ENGINE_NVJPG = 21, // NVidia JPeG [Decoder] (Ampere+) +}; +#define ENGINE_TYPES_LEN 22 +static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = { + "Graphics/Compute", + "COPY0", + "COPY1", + "COPY2", + "Unknown Engine ID#4", + "Unknown Engine ID#5", + "Unknown Engine ID#6", + "Unknown Engine ID#7", + "MSPDEC: Picture Decoder", + "MSPPP: Post Processing", + "MSVLD: Variable Length Decoder", + "MSENC: Encoder", + "VIC: Video Image Compositor", + "SEC: Sequencer", + "NVENC0: NVIDIA Video Encoder #0", + "NVENC1: NVIDIA Video Encoder #1", + "NVDEC: NVIDIA Video Decoder", + "Unknown Engine ID#17", + "IOCTRL: I/O Controller", + "LCE: Logical Copy Engine", + "GSP: GPU System Processor", + "NVJPG: NVIDIA JPEG Decoder", +}; + +/* GPU engine information and control register offsets + Each engine is described by one or more entries (terminated by an entry with + the `has_next_entry` flag unset) in the fixed-size PTOP_DEVICE_INFO table. A + typical device, such as the graphics/compute engine and any copy engines, are + described by three entries, one of each type. + + The PTOP_DEVICE_INFO table is sparsely populated (entries of type + INFO_TYPE_NOT_VALID may be intermingled with valid entries), so any traversal + code should check all NV_PTOP_DEVICE_INFO__SIZE_1 entries and not terminate + upon reaching the first entry of INFO_TYPE_NOT_VALID. + + INFO_TYPE : Is this a DATA, ENUM, or ENGINE_TYPE table entry? + HAS_NEXT_ENTRY : Does the following entry refer to the same engine? + + == INFO_TYPE_DATA fields == + PRI_BASE : BAR0 base = (PRI_BASE << 12) aka 4k aligned. + INST_ID : "Note that some instanced [engines] (such as logical copy + engines aka LCE) share a PRI_BASE across all [engines] of + the same engine type; such [engines] require an additional + offset: instanced base = BAR0 base + stride * INST_ID. + FAULT_ID_IS_VALID : Does this engine have its own bind point and fault ID + with the MMU? + FAULT_ID : "The MMU fault id used by this [engine]. These IDs + correspond to the NV_PFAULT_MMU_ENG_ID define list." 
+ + == INFO_TYPE_ENUM fields == + ENGINE_IS_VALID : Is this engine a host engine? + ENGINE_ENUM : "[T]he host engine ID for the current [engine] if it is + a host engine, meaning Host can send methods to the + engine. This id is used to index into any register array + whose __SIZE_1 is equal to NV_HOST_NUM_ENGINES. A given + ENGINE_ENUM can be present for at most one device in the + table. Devices corresponding to all ENGINE_ENUM ids 0 + through NV_HOST_NUM_ENGINES - 1 must be present in the + device info table." + RUNLIST_IS_VALID : Is this engine a host engine with a runlist? + RUNLIST_ENUM : "[T]he Host runlist ID on which methods for the current + [engine] should be submitted... The runlist id is used to + index into any register array whose __SIZE_1 is equal to + NV_HOST_NUM_RUNLISTS. [Engines] corresponding to all + RUNLIST_ENUM ids 0 through NV_HOST_NUM_RUNLISTS - 1 must + be present in the device info table." + INTR_IS_VALID : Does this device have an interrupt? + INTR_ENUM : Interrupt ID for use with "the NV_PMC_INTR_*_DEVICE + register bitfields." + RESET_IS_VALID : Does this engine have a reset ID? + RESET_ENUM : Reset ID for use indexing the "NV_PMC_ENABLE_DEVICE(i) + and NV_PMC_ELPG_ENABLE_DEVICE(i) register bitfields." + + == INFO_TYPE_ENGINE_TYPE fields == + ENGINE_TYPE : What type of engine is this? (see ENGINE_TYPES_NAMES) + + Support: Kepler, Maxwell, Pascal, Volta, Ampere + See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info. +*/ +#define NV_PTOP_DEVICE_INFO(i) (0x00022700+(i)*4) +#define NV_PTOP_DEVICE_INFO__SIZE_1 64 +typedef union { + // DATA type fields + struct { + enum DEVICE_INFO_TYPE info_type:2; + bool fault_id_is_valid:1; + uint32_t fault_id:7; + uint32_t padding0:2; + uint32_t pri_base:12; + uint32_t padding1:2; + uint32_t inst_id:4; + uint32_t is_not_enum2:1; + bool has_next_entry:1; + } __attribute__((packed)); + // ENUM type fields + struct { + uint32_t padding2:2; + bool reset_is_valid:1; + bool intr_is_valid:1; + bool runlist_is_valid:1; + bool engine_is_valid:1; + uint32_t padding3:3; + uint32_t reset_enum:5; + uint32_t padding4:1; + uint32_t intr_enum:5; + uint32_t padding5:1; + uint32_t runlist_enum:4; + uint32_t padding6:1; + uint32_t engine_enum:4; + uint32_t padding7:2; + } __attribute__((packed)); + // ENGINE_TYPE type fields + struct { + uint32_t padding8:2; + enum ENGINE_TYPES engine_type:29; + uint32_t padding9:1; + } __attribute__((packed)); + uint32_t raw; +} ptop_device_info_t; + +#define NV_PTOP_SCAL_NUM_GPCS 0x00022430 +#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434 +#define NV_PTOP_SCAL_NUM_CES 0x00022444 +// PCE_MAP is Volta+ only +#define NV_CE_PCE_MAP 0x00104028 + +// GPC and TPC masks +// Support: Maxwell+ +#define NV_FUSE_GPC 0x00021c1c +#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4) + +/* Location of the 1Kb instance block with page tables for BAR1 and BAR2. 
+ Support: Fermi+ (?), Pascal +*/ +#define NV_PBUS_BAR1_BLOCK 0x00001704 +#define NV_PBUS_BAR2_BLOCK 0x00001714 +typedef union { + struct { + uint32_t ptr:28; + enum INST_TARGET target:2; + uint32_t padding0:1; + bool is_virtual:1; + } __attribute__((packed)); + uint32_t raw; + struct { + uint32_t map:30; + uint32_t padding1:2; + } __attribute__((packed)); +} bar_config_block_t; + +/* BAR0 PRAMIN (Private RAM Instance) window configuration + + BASE : Base of window >> 16 in [TARGET] virtual address space + TARGET : Which address space BASE points into + + Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes + + Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere +*/ +#define NV_PBUS_BAR0_WINDOW 0x00001700 +#define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window) +#define NV_PRAMIN_LEN 0x00100000 +typedef union { + struct { + uint32_t base:24; + enum INST_TARGET target:2; + uint32_t padding0:6; + } __attribute__((packed)); + uint32_t raw; +} bar0_window_t; + +// Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere +#define NV_PRAMIN_PDB_CONFIG_OFF 0x200 +typedef union { + struct { + uint32_t target:2; + uint32_t vol:1; + uint32_t padding0:1; + uint32_t fault_replay_tex:1; + uint32_t fault_replay_gcc:1; + uint32_t padding1:4; + bool is_ver2:1; + bool is_64k_big_page:1; // 128Kb otherwise + uint32_t page_dir_lo:20; + uint32_t page_dir_hi:32; + } __attribute__((packed)); + uint64_t raw; +} page_dir_config_t; + +/* Page directory entry + + Note: Format changed with Pascal (how?) + + Support: Pascal, Volta, Turing, Ampere +*/ +// FIXME: PDE/PTEs are actually 64 bits =S +// Important: Aperture keys are different with PDEs +enum PD_TARGET { + PD_AND_TARGET_INVALID = 0, // b000 + PD_AND_TARGET_VID_MEM = 2, // b010 + PD_AND_TARGET_SYS_MEM_COHERENT = 4, // b100 + PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6, // b110 + PTE_AND_TARGET_VID_MEM = 1, // b001 + PTE_AND_TARGET_PEER = 3, // b011 + PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101 + PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111 +}; +static inline char* pd_target_to_text(enum PD_TARGET t) { + switch (t) { + case PD_AND_TARGET_INVALID: + return "INVALID"; + case PD_AND_TARGET_VID_MEM: + case PTE_AND_TARGET_VID_MEM: + return "VID_MEM"; + case PTE_AND_TARGET_PEER: + return "PEER"; + case PD_AND_TARGET_SYS_MEM_COHERENT: + case PTE_AND_TARGET_SYS_MEM_COHERENT: + return "SYS_MEM_COHERENT"; + case PD_AND_TARGET_SYS_MEM_NONCOHERENT: + case PTE_AND_TARGET_SYS_MEM_NONCOHERENT: + return "SYS_MEM_NONCOHERENT"; + default: + printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); + return NULL; + } +} + +// PDE/PTE V2 type +// Note: As the meaning of target (bits 2:1) changes depending on if the entry +// is a PTE or not, this combines them into a single target field to +// simplify comparisons. 
+// Support: Pascal, Turing, Ampere +typedef union { + // Page Directory Entry (PDE) + struct { + bool is_pte:1; + uint32_t __target:2; + bool is_volatile:1; + uint32_t padding1:4; + uint32_t addr:24; + } __attribute__((packed)); + // Page Table Entry (PTE) + struct { + enum PD_TARGET target:3; + uint32_t __is_volatile:1; + bool is_encrypted:1; + bool is_privileged:1; + bool is_readonly:1; + bool atomics_disabled:1; + uint32_t __addr:24; + } __attribute__((packed)); + uint32_t raw; +} page_dir_entry_t; + +// PDE/PTE V1 types +// Support: Fermi, Kepler, Maxwell +enum V1_PD_TARGET { + PD_TARGET_INVALID = 0, + PD_TARGET_VID_MEM = 1, + PD_TARGET_SYS_MEM_COHERENT = 2, + PD_TARGET_SYS_MEM_NONCOHERENT = 3, +}; +// Page Directory Entry (PDE) +typedef union { +// Large page fields + struct { +// 0:32 + enum V1_PD_TARGET target:2; + uint32_t padding0:2; + uint64_t addr:28; // May be wider? +// 32:63 + uint32_t padding2:3; + uint32_t is_volatile:1; // Might have counted wrong? + uint32_t padding3:28; + } __attribute__((packed)); +// Small page fields + struct { +// 0:32 + uint32_t padding00:32; +// 32:63 + enum V1_PD_TARGET alt_target:2; + uint32_t alt_is_volatile:1; // Might have counted wrong? + uint32_t padding03:1; + uint64_t alt_addr:28; + } __attribute__((packed)); + uint64_t raw; +} page_dir_entry_v1_t; +// Page Table Entry (PTE) +// Reconstructed from info in Jetson nvgpu driver +typedef union { + struct { +// 0:32 + bool is_present:1; + bool is_privileged:1; + bool is_readonly:1; + uint32_t padding0:1; + uint64_t addr:28; +// 32:63 + bool is_volatile:1; + enum INST_TARGET:2; + uint32_t padding1:1; + uint32_t kind:8; + uint32_t comptag:17; + uint32_t padding2:1; + bool is_read_disabled:1; + bool is_write_disabled:1; + } __attribute__((packed)); + uint64_t raw; +} page_tbl_entry_v1_t; +//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3}; +//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3}; +//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024}; +/* PDE V0 (nv50/Tesla) +typedef union { + struct { + enum V1_PDE_TYPE type:2; + enum INST_TARGET target:2; + uint32_t padding0:1; + enum V1_PDE_SIZE sublevel_size:2; + uint32_t padding1:5; + uint32_t addr:28; + uint32_t padding2:24; + } __attribute__((packed)); + uint64_t raw; +} page_dir_entry_v1_t;*/ +/* PTE V0 (nv50) +typedef union { + struct { + bool is_present:1; + uint32_t padding3:2; + bool is_readonly:1; + enum INST_TARGET target:2; + bool is_privileged:1; + uint32_t contig_blk_sz:3; + uint32_t padding4:2; + uint32_t addr:28; + uint32_t storage_type:7; // ??? + uint32_t compression_mode:2; // ??? + uint32_t compression_tag:12; // ??? + bool is_long_partition_cycle:1; // ??? + bool is_encrypted:1; + uint32_t padding5:1; + } __attribute__((packed)); + uint64_t raw; +} page_tbl_entry_v1_t;*/ + // TODO(jbakita): Maybe put the above GPU types in a different file. -#define for_chan_in_tsg(chan, tsg) \ - for (chan = (struct runlist_chan*)(tsg + 1); \ - (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \ - chan++) +#define NV_PCI_VENDOR 0x10de +struct nvdebug_state { + // Pointer to the mapped base address of the GPU control registers (obtained + // via ioremap() originally). For embedded GPUs, we extract this from their + // struct nvgpu_os_linux. For discrete GPUs, we create our own mapping of + // BAR0 with pci_iomap(). Access via nvgpu_readl/writel functions. 
+	void __iomem *regs;
+	// Depending on the architecture, BAR2 or BAR3 is used to access PRAMIN
+	union {
+		void __iomem *bar2;
+		void __iomem *bar3;
+	};
+	int chip_id;
+	// Additional state from the built-in driver. Only set iff
+	// chip_id == NV_CHIP_ID_GV11B
+	struct gk20a *g;
+	// Pointer to PCI device needed for pci_iounmap
+	struct pci_dev *pcid;
+};
+
+/*const struct runlist_funcs {
+	u8 size;
+	enum ENTRY_TYPE (*entry_type)(struct nvdebug_state *, void *);
+	uint32_t (*chid)(struct nvdebug_state *, void *);
+	uint32_t (*inst_ptr_lo)(struct nvdebug_state *, void *);
+	enum INST_TARGET (*inst_target)(struct nvdebug_state *, void *);
+	uint32_t (*tsgid)(struct nvdebug_state *, void *);
+	uint32_t (*timeslice_scale)(struct nvdebug_state *, void *);
+	uint32_t (*timeslice_timeout)(struct nvdebug_state *, void *);
+	uint32_t (*tsg_length)(struct nvdebug_state *, void *);
+};*/
+
+// This disgusting macro is a crutch to work around the fact that runlists were
+// different prior to Volta. (The comparisons are >=, matching NV_RL_ENTRY_SIZE
+// below, so that the first chip of each architecture selects its own format.)
+#define VERSIONED_RL_ACCESSOR(_ENTRY_TYPE, type, prop) \
+	__attribute__((unused)) \
+	static type (prop)(const struct nvdebug_state *g, const void *raw) { \
+		if (g->chip_id >= NV_CHIP_ID_VOLTA) { \
+			const struct gv100_runlist_ ## _ENTRY_TYPE *entry = (struct gv100_runlist_ ## _ENTRY_TYPE*)raw; \
+			return entry->prop; \
+		} else if (g->chip_id >= NV_CHIP_ID_KEPLER) { \
+			const struct gk110_runlist_ ## _ENTRY_TYPE *entry = (struct gk110_runlist_ ## _ENTRY_TYPE*)raw; \
+			return entry->prop; \
+		} else { \
+			printk(KERN_WARNING "[nvdebug] " #prop " unavailable on GPU ID %x, which is older than Kepler.\n", g->chip_id); \
+			return (type)0; \
+		} \
+	}
+
+VERSIONED_RL_ACCESSOR(chan, uint32_t, chid);
+VERSIONED_RL_ACCESSOR(chan, uint32_t, inst_ptr_lo);
+VERSIONED_RL_ACCESSOR(chan, enum INST_TARGET, inst_target);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsgid);
+VERSIONED_RL_ACCESSOR(tsg, enum ENTRY_TYPE, entry_type);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_scale);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_timeout);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
 
-#define next_tsg(tsg) \
-	(void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length
+
+#define NV_RL_ENTRY_SIZE(g) \
+	((g)->chip_id >= NV_CHIP_ID_VOLTA ? sizeof(struct gv100_runlist_tsg) : sizeof(struct gk110_runlist_tsg))
+
+#define for_chan_in_tsg(g, chan, tsg) \
+	for (chan = (typeof(chan))(((u8*)tsg) + NV_RL_ENTRY_SIZE(g)); \
+	     (u8*)chan < ((u8*)tsg) + (1 + tsg_length(g, tsg)) * NV_RL_ENTRY_SIZE(g); \
+	     chan = (typeof(chan))(((u8*)chan) + NV_RL_ENTRY_SIZE(g)))
+
+#define next_tsg(g, tsg) \
+	(typeof(tsg))((u8*)(tsg) + NV_RL_ENTRY_SIZE(g) * (tsg_length(g, tsg) + 1))
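+
+// Iteration sketch (hypothetical `rl` buffer of `len` entries; `rl` must be
+// CPU-dereferenceable): the versioned accessors and macros above compose like
+// so to walk a runlist:
+//
+//   void *entry = rl;
+//   while (entry < rl + len * NV_RL_ENTRY_SIZE(g)) {
+//       if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
+//           void *chan;
+//           printk(KERN_INFO "TSG %d\n", tsgid(g, entry));
+//           for_chan_in_tsg(g, chan, entry)
+//               printk(KERN_INFO "  Channel %d\n", chid(g, chan));
+//           entry = next_tsg(g, entry);
+//       } else { // Bare channel (pre-Volta only)
+//           entry = (u8*)entry + NV_RL_ENTRY_SIZE(g);
+//       }
+//   }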
 struct runlist_iter {
-	struct entry_tsg *curr_tsg;
+	// Pointer to either a TSG or channel entry (they're the same size)
+	void *curr_entry;
+	// This should be set to tsg_length when a TSG is reached, and
+	// decremented as each subsequent channel is printed. This allows us to
+	// track which channels are and are not part of the TSG.
+	int channels_left_in_tsg;
+	// Total runlist length, etc
 	runlist_info_t rl_info;
 };
 
+#define NVDEBUG_MAX_DEVICES 8
+extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
+
 // Defined in runlist.c
-struct gk20a* get_live_gk20a(void);
-int get_runlist_iter(struct runlist_iter *rl_iter);
-int preempt_tsg(uint32_t tsg_id);
+int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
+int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
+
+// Defined in mmu.c
+uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
+void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
+uint64_t search_page_directory(
+		struct nvdebug_state *g,
+		void __iomem *pde_offset,
+		void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+		uint64_t addr_to_find);
+uint64_t search_v1_page_directory(
+		struct nvdebug_state *g,
+		void __iomem *pde_offset,
+		void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+		uint64_t addr_to_find);
+
 
 static inline struct gk20a *get_gk20a(struct device *dev) {
 	// XXX: Only works because gk20a* is the first member of gk20a_platform
 	return *((struct gk20a**)dev_get_drvdata(dev));
 }
 
-// Functionally identical to nvgpu_readl()
+// We use the data field of the proc_dir_entry ("PDE" here) to store our
+// index into the g_nvdebug_state array
+static inline int seq2gpuidx(struct seq_file *s) {
+	const struct file *f = s->file;
+	return (uintptr_t)PDE_DATA(file_inode(f));
+}
+static inline int file2gpuidx(const struct file *f) {
+	return (uintptr_t)PDE_DATA(file_inode(f));
+}
+static inline int file2parentgpuidx(const struct file *f) {
+	// Should be safe to call on ProcFS entries, as our parent should (?)
+	// still exist if we're called. If not, there are worse races in this
+	// module.
+	return (uintptr_t)PDE_DATA(file_dentry(f)->d_parent->d_inode);
+}
+
+#define gk20a_regs(gk20a) (container_of(gk20a, struct nvgpu_os_linux, g)->regs)
+
+// Similar to nvgpu_readl()
 // (except we don't try to resolve situations where regs is NULL)
-static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	if (unlikely(!g_os->regs)) {
-		printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-		return -1;
-	}
-	return readl(g_os->regs + r);
+static inline u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readl on non-existent registers!\n");
+		return -1;
+	}
+	return readl(s->regs + r);
 }
 
 // quadword version of nvdebug_readl()
-static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	u64 ret;
-	if (unlikely(!g_os->regs)) {
-		printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-		return -1;
-	}
+static inline u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
+	u64 ret;
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readq on non-existent registers!\n");
+		return -1;
+	}
 	// readq seems to always return the uppermost 32 bits as 0, so workaround with readl
-	ret = readl(g_os->regs + r);
-	ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
+	ret = readl(s->regs + r);
+	ret |= ((u64)readl(s->regs + r + 4)) << 32;
 	return ret;
 }
 
-// Functionally identical to nvgpu_writel()
-static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
-	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
-	if (unlikely(!g_os->regs)) {
+// Similar to nvgpu_writel()
+static inline void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
+		printk(KERN_ERR "[nvdebug] Attempted nvdebug_writel on non-existent registers!\n");
+		return;
+	}
+	writel_relaxed(v, s->regs + r);
+	wmb();
+}
+
+// quadword version of nvdebug_writel()
+// XXX: This probably doesn't work XXX: Untested
+static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
+	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
 		printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
 		return;
 	}
-	writel_relaxed(v, g_os->regs + r);
+	writeq_relaxed(v, s->regs + r);
 	wmb();
 }
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 0854b8b..695b5fd 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -2,64 +2,282 @@
  * SPDX-License-Identifier: MIT
  */
 
-/* TODO
- *  - Add sysfs trigger for a preemption
- */
-
 #include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
+#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/proc_fs.h> // So we can set up entries in /proc
+#include <linux/pci.h> // For PCI device scanning
+#include <linux/proc_fs.h> // So we can set up entries in /proc
 
 #include "nvdebug.h"
+#include "stubs.h"
 
-// LIAR. But without this we can't use GPL-only exported symbols like
+// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
 // platform_bus_type or bus_find_device_by_name...
-MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual MIT/GPL"); MODULE_AUTHOR("Joshua Bakita"); MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs"); -MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now extern const struct file_operations runlist_file_ops; extern const struct file_operations preempt_tsg_file_ops; extern const struct file_operations disable_channel_file_ops; extern const struct file_operations enable_channel_file_ops; extern const struct file_operations switch_to_tsg_file_ops; +extern const struct file_operations device_info_file_ops; +extern const struct file_operations nvdebug_read_reg32_file_ops; + +// Bus types are global symbols in the kernel +extern struct bus_type platform_bus_type; +struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; +unsigned int g_nvdebug_devices = 0; + +// TEMP +irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { + printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num); + return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. +} + +// Find any and all NVIDIA GPUs in the system +// Note: This function fails if any of them are in a bad state +int probe_and_cache_device(void) { + // platform bus (SoC) iterators + struct device *dev = NULL; + struct device *temp_dev; + // PCI search iterator and search query + struct pci_dev *pcid = NULL; + // This query pattern is mirrored off nouveau + struct pci_device_id query = { + .vendor = NV_PCI_VENDOR, // Match NVIDIA devices + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .class_mask = 0xff << 16, + .class = PCI_BASE_CLASS_DISPLAY << 16, // Match display devs + }; + int i = 0; + // Search the platform bus for the first device that matches our name + // Search for GV10B (Jetson Xavier) + while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) + dev = temp_dev; + // Search for GP10B (Jetson TX2) + while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b"))) + dev = temp_dev; + // TODO: Support other platform bus devices (gk20a, gm20b) + if (dev) { + struct nvgpu_os_linux *l; + mc_boot_0_t ids; + g_nvdebug_state[i].g = get_gk20a(dev); + l = container_of(g_nvdebug_state[i].g, struct nvgpu_os_linux, g); + g_nvdebug_state[i].regs = l->regs; + if (!g_nvdebug_state[i].regs) + return -EADDRNOTAVAIL; + ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); + if (ids.raw == -1) + return -EADDRNOTAVAIL; + g_nvdebug_state[i].chip_id = ids.chip_id; + printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.", + ids.chip_id, ARCH2NAME(ids.architecture)); + i++; + } + // Search the PCI bus and iterate through all matches + // FIXME: State rollback + while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) { + mc_boot_0_t ids; + g_nvdebug_state[i].g = NULL; + // Map BAR0 (GPU control registers) + g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); + if (!g_nvdebug_state[i].regs) { + pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); + return -EADDRNOTAVAIL; + } + // Map BAR3 (CPU-accessible mappings of GPU DRAM) + g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0); + // Try mapping only the lower half of BAR3 on fail + // (vesafb may map the top half for display) + if (!g_nvdebug_state[i].bar3) + g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); + g_nvdebug_state[i].pcid = pcid; + ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); + if 
(ids.raw == -1) { + pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n"); + return -EADDRNOTAVAIL; + } + g_nvdebug_state[i].chip_id = ids.chip_id; + printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", + ids.chip_id, ARCH2NAME(ids.architecture)); + // TEMP + if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) { + printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n"); + } + i++; + } + // Return the number of devices we found + if (i > 0) + return i; + return -ENODEV; +} + +// Create files `/proc/gpu#/runlist#`, world readable +int create_runlist_files(int device_id, struct proc_dir_entry *dir) { + ptop_device_info_t info; + struct proc_dir_entry *rl_entry; + int i, rl_id; + char runlist_name[12]; + int max_rl_id = 0; // Always at least one runlist + // Figure out how many runlists there are by checking the device info + // registers. Runlists are always numbered sequentially, so we just have + // to find the highest-valued one and add 1 to get the number of runlists. + for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) { + info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO(i)); + if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) + continue; + if (info.runlist_enum > max_rl_id) + max_rl_id = info.runlist_enum; + } + // Create files to read each runlist. The read handling code looks at the + // PDE_DATA associated with the file to determine what the runlist ID is. + for (rl_id = 0; rl_id <= max_rl_id; rl_id++) { + snprintf(runlist_name, 12, "runlist%d", rl_id); + rl_entry = proc_create_data( + runlist_name, 0444, dir, &runlist_file_ops, + (void*)(uintptr_t)rl_id); + if (!rl_entry) + return -ENOMEM; + } + return 0; +} + +// Create files /proc/gpu# +// TODO: Don't run this on unsupported GPUs +int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) { + char file_name[20]; + int i; + struct proc_dir_entry *gpc_tpc_mask_entry; + // Get a bitmask of which GPCs are disabled + uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC); + // Get maximum number of enabled GPCs for this chip + uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS); + // For each enabled GPC, expose a mask of disabled TPCs + for (i = 0; i < max_gpcs; i++) { + // Do nothing if GPC is disabled + if ((1 << i) & gpcs_mask) + continue; + // If GPC is enabled, create an entry to read disabled TPCs mask + snprintf(file_name, 20, "gpc%d_tpc_mask", i); + gpc_tpc_mask_entry = proc_create_data( + file_name, 0444, dir, &nvdebug_read_reg32_file_ops, + (void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i)); + if (!gpc_tpc_mask_entry) + return -ENOMEM; + } + return 0; +} int __init nvdebug_init(void) { - struct proc_dir_entry *rl_entry, *preempt_entry, *disable_channel_entry, - *enable_channel_entry, *switch_to_tsg_entry; - // Create file `/proc/preempt_tsg`, world readable - rl_entry = proc_create("runlist", 0444, NULL, &runlist_file_ops); - // Create file `/proc/preempt_tsg`, world writable - preempt_entry = proc_create("preempt_tsg", 0222, NULL, &preempt_tsg_file_ops); - // Create file `/proc/disable_channel`, world writable - disable_channel_entry = proc_create("disable_channel", 0222, NULL, &disable_channel_file_ops); - // Create file `/proc/enable_channel`, world writable - enable_channel_entry = proc_create("enable_channel", 0222, NULL, &enable_channel_file_ops); - // Create file `/proc/switch_to_tsg`, world writable - 
switch_to_tsg_entry = proc_create("switch_to_tsg", 0222, NULL, &switch_to_tsg_file_ops);
-	// ProcFS entry creation only fails if out of memory
-	if (!rl_entry || !preempt_entry || !disable_channel_entry || !enable_channel_entry || !switch_to_tsg_entry) {
-		remove_proc_entry("runlist", NULL);
-		remove_proc_entry("preempt_tsg", NULL);
-		remove_proc_entry("disable_channel", NULL);
-		remove_proc_entry("enable_channel", NULL);
-		remove_proc_entry("switch_to_tsg", NULL);
-		printk(KERN_ERR "[nvdebug] Unable to initialize procfs entries!\n");
-		return -ENOMEM;
+	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
+		*enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
+		*num_gpcs_entry;
+	int rl_create_err, tpc_masks_create_err;
+	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
+	int res = probe_and_cache_device();
+	if (res < 0)
+		return res;
+	g_nvdebug_devices = res;
+	// Create separate ProcFS directories for each gpu
+	while (res--) {
+		char device_id_str[7];
+		uintptr_t device_id = res; // This is a uintptr_t as we abuse the *data field on proc_dir_entry to store the GPU id
+		// Create directory /proc/gpu# where # is the GPU number
+		snprintf(device_id_str, 7, "gpu%lu", device_id);
+		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
+			goto out_nomem;
+		// Create files `/proc/gpu#/runlist#`, world readable
+		rl_create_err = create_runlist_files(device_id, dir);
+		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
+		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
+		// Create file `/proc/gpu#/preempt_tsg`, world writable
+		preempt_entry = proc_create_data(
+			"preempt_tsg", 0222, dir, &preempt_tsg_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/disable_channel`, world writable
+		disable_channel_entry = proc_create_data(
+			"disable_channel", 0222, dir, &disable_channel_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/enable_channel`, world writable
+		enable_channel_entry = proc_create_data(
+			"enable_channel", 0222, dir, &enable_channel_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/switch_to_tsg`, world writable
+		switch_to_tsg_entry = proc_create_data(
+			"switch_to_tsg", 0222, dir, &switch_to_tsg_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/device_info`, world readable
+		device_info_entry = proc_create_data(
+			"device_info", 0444, dir, &device_info_file_ops,
+			(void*)device_id);
+		// Create file `/proc/gpu#/num_gpcs`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_gpcs", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_GPCS);
+		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_tpc_per_gpc", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
+		// Create file `/proc/gpu#/num_ces`, world readable
+		num_gpcs_entry = proc_create_data(
+			"num_ces", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_PTOP_SCAL_NUM_CES);
+		// Create file `/proc/gpu#/gpc_mask`, world readable
+		num_gpcs_entry = proc_create_data(
+			"gpc_mask", 0444, dir, &nvdebug_read_reg32_file_ops,
+			(void*)NV_FUSE_GPC);
+		// In both nouveau and nvgpu, the PCE_MAP register is only available on Volta+
+		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
+			// TODO: Redo to num_pces
+			// Create file `/proc/gpu#/pce_map`, world readable
+			num_gpcs_entry = proc_create_data(
+				"pce_map", 0444, dir, &nvdebug_read_reg32_file_ops,
+				(void*)NV_CE_PCE_MAP);
+		}
+		// ProcFS entry creation only fails if out of memory
+		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
+		    !disable_channel_entry || !enable_channel_entry ||
+		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
+			goto out_nomem;
 	}
+	// (See Makefile if you want to know the origin of GIT_HASH.)
 	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
 	return 0;
+out_nomem:
+	// Make sure to clear all ProcFS directories on error
+	while (res < g_nvdebug_devices) {
+		char device_id_str[7];
+		snprintf(device_id_str, 7, "gpu%d", res);
+		remove_proc_subtree(device_id_str, NULL);
+		res++;
+	}
+	return -ENOMEM;
 }
 
 static void __exit nvdebug_exit(void) {
-	remove_proc_entry("runlist", NULL);
-	remove_proc_entry("preempt_tsg", NULL);
-	remove_proc_entry("disable_channel", NULL);
-	remove_proc_entry("enable_channel", NULL);
-	remove_proc_entry("switch_to_tsg", NULL);
-	printk(KERN_INFO "[nvdebug] Exiting...\n");
+	struct nvdebug_state *g;
+	// Deinitialize each device
+	while (g_nvdebug_devices--) {
+		// Remove procfs directory
+		char device_id[7];
+		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
+		remove_proc_subtree(device_id, NULL);
+		g = &g_nvdebug_state[g_nvdebug_devices];
+		// Free BAR mappings and the IRQ tap for PCI devices only (on Tegra,
+		// `regs` is borrowed from the nvgpu driver and must not be unmapped,
+		// and no IRQ tap was installed)
+		if (g->pcid) {
+			if (g->regs)
+				pci_iounmap(g->pcid, g->regs);
+			if (g->bar2)
+				pci_iounmap(g->pcid, g->bar2);
+			// TEMP
+			free_irq(g->pcid->irq, g->pcid);
+		}
+		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
+	}
+	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
 }
 
 module_init(nvdebug_init);
diff --git a/runlist.c b/runlist.c
index c8ff99f..94be18e 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,122 +1,127 @@
-#include <linux/device.h> // For struct device, bus_find_device*(), struct bus_type
-//#include <linux/iommu.h> // For struct iommu_domain
 #include <linux/types.h> // Kernel types
-#include
 
 #include "nvdebug.h"
 
-// Bus types are global symbols in the kernel
-extern struct bus_type platform_bus_type;
-
-struct gk20a* get_live_gk20a(void) {
-	struct device *dev = NULL;
-	struct device *temp_dev;
-	struct gk20a *g;
-	struct nvgpu_os_linux *l;
-	// Get the last device that matches our name
-	while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
-		dev = temp_dev;
-		printk(KERN_INFO "[nvdebug] Found a matching device %s\n", dev_name(dev));
-	}
-	if (!dev)
-		return NULL;
-	g = get_gk20a(dev);
-	// The address pointed to `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
-	// - A GPU address (type is sysmem_coherent)
-	// - A physical address (dereferencing after ioremap crashes)
-	// - A kernel virtual address (dereferencing segfaults)
-	// So maybe it's some sort of custom thing? This is an address that the GPU
-	// can use, so it would make most sense for it to be a physical address.
-	//
-	// BUT, it can't possibly be a physical address, as it would refer to an
-	// address greater than the maximum one on our system (by a lot!).
-	// Maybe I'm reading the runlist base wrong?
-	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
-	// address! So, what's this I/O address space? All I know is that it's what
-	// nvgpu_mem_get_addr() returns. That function returns the result of either:
-	// - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
-	//   converts an IPA to a PA?
-	// - nvgpu_mem_iommu_translate
-	//
-	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
-	// returns SYSMEM.
-	//
-	// To convert a physical address to a IOMMU address, we add a bit
-	//
-	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS!
-	// before because the GPU had simply gone to sleep and invalidated its
-	// register state, so nvgpu_readl() was simply returning garbage.
-	l = container_of(g, struct nvgpu_os_linux, g);
-	if (!l->regs)
-		return NULL;
-	return g;
-}
-
 /* Get runlist head and info (incl. length)
    @param rl_iter Location at which to store output
+   @param rl_id   ID of the runlist to read
 */
-int get_runlist_iter(struct runlist_iter *rl_iter) {
-	struct entry_tsg head;
-	runlist_base_t rl_base;
-	runlist_info_t rl_info;
-	u64 runlist_iova;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
+int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter) {
+	runlist_base_t rl_base;
+	runlist_info_t rl_info;
+	u64 runlist_iova;
+	*rl_iter = (struct runlist_iter){0};
+	rl_base.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST_BASE(rl_id));
+	// Check that reads are working
+	if (rl_base.raw == -1)
 		return -EIO;
-	rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
-	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
-	runlist_iova = ((u64)rl_base.ptr) << 12;
-	printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n",
-		rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova);
-	// TODO: Support reading video memory
-	if (rl_base.type == TARGET_VID_MEM) {
-		printk(KERN_ERR "[nvdebug] Runlist is located in video memory. Access to video memory is unimplemented.");
-		return -ENOTSUPP;
+	// The address pointed to by `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
+	// - A GPU address (type is sysmem_coherent)
+	// - A physical address (dereferencing after ioremap crashes)
+	// - A kernel virtual address (dereferencing segfaults)
+	// So maybe it's some sort of custom thing? This is an address that the GPU
+	// can use, so it would make most sense for it to be a physical address.
+	//
+	// BUT, it can't possibly be a physical address, as it would refer to an
+	// address greater than the maximum one on our system (by a lot!).
+	// Maybe I'm reading the runlist base wrong?
+	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
+	// address! So, what's this I/O address space? All I know is that it's what
+	// nvgpu_mem_get_addr() returns. That function returns the result of either:
+	// - gpu_phys_addr, which is __nvgpu_sgl_phys on our platform, which (?)
+	//   converts an IPA to a PA?
+	// - nvgpu_mem_iommu_translate
+	//
+	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
+	// returns SYSMEM.
+	//
+	// To convert a physical address to an IOMMU address, we add a bit
+	//
+	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
+	// before because the GPU had simply gone to sleep and invalidated its
+	// register state, so nvgpu_readl() was simply returning garbage.
+	rl_info.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST(rl_id));
+	runlist_iova = ((u64)rl_base.ptr) << 12;
+	printk(KERN_INFO "[nvdebug] Runlist %d @ %llx in %s (config raw: %x)\n",
+		rl_id, runlist_iova, target_to_text(rl_base.target), rl_base.raw);
+	printk(KERN_INFO "[nvdebug] Runlist length %d, ID %d\n", rl_info.len, rl_info.id);
+	// Return early on an empty runlist
+	if (!rl_info.len)
+		return 0;
+	// If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping
+	if (rl_base.target == TARGET_VID_MEM) {
+		printk(KERN_WARNING "[nvdebug] Runlist is located in video memory. Access to video memory is experimental.");
+		bar_config_block_t bar1_block, bar2_block;
+		bar1_block.raw = nvdebug_readl(g, NV_PBUS_BAR1_BLOCK);
+		printk(KERN_INFO "[nvdebug] BAR1 inst block @ %llx in %s's %s address space.\n", ((u64)bar1_block.ptr) << 12, target_to_text(bar1_block.target), bar1_block.is_virtual ? "virtual" : "physical");
+		bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK);
+		printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", ((u64)bar2_block.ptr) << 12, target_to_text(bar2_block.target), bar2_block.is_virtual ? "virtual" : "physical");
+		uint32_t bar_inst_pramin_offset = vram2PRAMIN(g, (uint64_t)bar2_block.ptr << 12);
+		if (!bar_inst_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}
+		/* TODO: Support BAR1?
+		bar_inst_pramin_offset = vram2PRAMIN(g, bar1_block.ptr << 12);
+		if (!bar_inst_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR1 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}*/
+		// Instance blocks (size == 1 KiB) contain many things, but we only care about
+		// the section which describes the location of the page directory (page table)
+		uint32_t bar_pdb_config_pramin_offset = bar_inst_pramin_offset + NV_PRAMIN_PDB_CONFIG_OFF;
+		page_dir_config_t pd_config;
+		pd_config.raw = nvdebug_readq(g, bar_pdb_config_pramin_offset + NV_PRAMIN);
+		uint64_t bar_pdb_vram_addr = pd_config.page_dir_hi;
+		bar_pdb_vram_addr <<= 20;
+		bar_pdb_vram_addr |= pd_config.page_dir_lo;
+		bar_pdb_vram_addr <<= 12;
+		printk(KERN_INFO "[nvdebug] BAR2 PDB @ %llx in %s of version %s (config raw: %llx)\n", bar_pdb_vram_addr, target_to_text(pd_config.target), pd_config.is_ver2 ? "2" : "1", pd_config.raw);
+		// TODO: SYSMEM support for page table location
+		if (pd_config.target != TARGET_VID_MEM) {
+			printk(KERN_WARNING "[nvdebug] BAR2 PDB is in an unsupported location.\n");
+			return -EOPNOTSUPP;
+		}
+		uint32_t bar_pdb_pramin_offset = vram2PRAMIN(g, bar_pdb_vram_addr);
+		if (!bar_pdb_pramin_offset) {
+			printk(KERN_WARNING "[nvdebug] Unable to find the page directory for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
+			return -EOPNOTSUPP;
+		}
+		uint64_t runlist_bar_vaddr;
+		if (pd_config.is_ver2)
+			runlist_bar_vaddr = search_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+		else
+			runlist_bar_vaddr = search_v1_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+		if (!runlist_bar_vaddr) {
+			printk(KERN_WARNING "[nvdebug] Unable to find runlist mapping in BAR2/3 page tables.\n");
+			return -EOPNOTSUPP;
+		}
+		printk(KERN_INFO "[nvdebug] Runlist @ %llx in BAR2 virtual address space.\n", runlist_bar_vaddr);
+		/* XXX: Old test code
+		uint32_t bar2_pd_pramin_offset = vram_to_pramin_off(bar2_pd);
+		//walk_pd_subtree(bar2_pd_pramin_offset);
+		uint64_t runlist_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, runlist_iova);
+		page_dir_entry_t pde_0;
+		pde_0.raw = nvdebug_readl(g, NV_PRAMIN + bar2_pd_pramin_offset);
+		uint32_t pde_1 = nvdebug_readl(g, NV_PRAMIN + vram_to_pramin_off(((u64)pde_0.addr) << 12));
+		uint64_t pde_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, ((u64)pde_0.addr) << 12);
+		uint32_t pde_2 = readl(g->bar3 + pde_bar2_vaddr);
+		printk(KERN_INFO "[nvdebug] PDE0 via PRAMIN: %x, via BAR3: %x\n", pde_1, pde_2);
+		*/
+		if (!g->bar2) {
+			printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped.\n");
+			return -ENODEV;
+		}
+		rl_iter->curr_entry = g->bar2 + runlist_bar_vaddr;
+	} else {
+		// Directly access the runlist if stored in SYS_MEM (physically addressed)
+		rl_iter->curr_entry = phys_to_virt(runlist_iova);
 	}
-	// Segfaults
-	//u32 attempted_read = ioread32(runlist_iova);
-	//printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);
-
-	// Errors out
-	//u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
-	//printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);
-
-	/* Overcomplicated?
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-	if (!domain) {
-		printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
-		return -EIO;
-	}
-	u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
-	printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
-	*/
-
-	printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
-	printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
-	head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
-
-	rl_iter->curr_tsg = (struct entry_tsg*)phys_to_virt(runlist_iova);
-	rl_iter->rl_info = rl_info;
-	return 0;
-	//printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
-	//printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
-	//printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
-	//printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
-	//printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
-
-	//printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
-	//printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
-	//printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
-	//read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
-	/*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
-	printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
-	printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
-	printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
-	printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid); */
+	rl_iter->rl_info = rl_info;
+	return 0;
 }
 
-int preempt_tsg(uint32_t tsg_id) {
-	struct gk20a *g = get_live_gk20a();
+int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id) {
 	runlist_info_t rl_info;
 	pfifo_preempt_t pfifo_preempt;
 	runlist_disable_t rl_disable;
diff --git a/runlist_procfs.c b/runlist_procfs.c
index 411f844..a6b0d94 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -6,7 +6,14 @@
 #define RUNLIST_PROCFS_NAME "runlist"
 
 #define DETAILED_CHANNEL_INFO
-static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uint32_t chid) {
+/* Print channel details using PCCSR (Programmable Channel Control System RAM?)
+ * @param s Pointer to state from seq_file subsystem to pass to seq_printf
+ * @param g Pointer to our internal GPU state
+ * @param chid ID of channel to print details on, range [0, 512)
+ * @param prefix Text string to prefix each line with, or empty string
+ */
+#ifdef DETAILED_CHANNEL_INFO
+static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) {
 	channel_ctrl_t chan;
 	char *loc_txt;
 	u64 instance_ptr;
@@ -16,23 +23,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct gk20a *g, uin
 		return -EIO;
 	instance_ptr = chan.inst_ptr;
 	instance_ptr <<= 12;
-	seq_printf(s, " +- Channel Info %-4d -+\n", chid);
-	seq_printf(s, " | Enabled: %d|\n", chan.enable);
-	seq_printf(s, " | Next: %d|\n", chan.next);
-	seq_printf(s, " | Force CTX Reload: %d|\n", chan.force_ctx_reload);
-	seq_printf(s, " | Enable set: %d|\n", chan.enable_set);
-	seq_printf(s, " | Enable clear: %d|\n", chan.enable_clear);
-	seq_printf(s, " | PBDMA Faulted: %d|\n", chan.pbdma_faulted);
-	seq_printf(s, " | ENG Faulted: %d|\n", chan.eng_faulted);
-	seq_printf(s, " | Status: %2d|\n", chan.status);
-	seq_printf(s, " | Busy: %d|\n", chan.busy);
-	seq_printf(s, " | Instance PTR: |\n");
-	seq_printf(s, " | %#018llx |\n", instance_ptr);
-	seq_printf(s, " | %-20s|\n", loc_txt);
-	seq_printf(s, " | Instance bound: %d|\n", chan.inst_bind);
-	seq_printf(s, " +---------------------+\n");
+	seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid);
+	seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable);
+	seq_printf(s, "%s| Next: %d|\n", prefix, chan.next);
+	seq_printf(s, "%s| Force CTX Reload: %d|\n", prefix, chan.force_ctx_reload);
+	seq_printf(s, "%s| Enable set: %d|\n", prefix, chan.enable_set);
+	seq_printf(s, "%s| Enable clear: %d|\n", prefix, chan.enable_clear);
+	seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted);
+	seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted);
+	seq_printf(s, "%s| Status: %2d|\n", prefix, chan.status);
+	seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy);
+	seq_printf(s, "%s| Instance PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx |\n", prefix, instance_ptr);
+	seq_printf(s, "%s| %-20s|\n", prefix, loc_txt);
+	seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
+	// START TEMP
+	// "runlist_id -1 is synonym for the ENGINE_GR_GK20A runlist id"
+	// GR, GRCE, and ASYNC_CE
+	// Note that this appears to be broken??
+	// Peek into the channel instance RAM
+	if (chan.inst_target == TARGET_SYS_MEM_COHERENT) {
+		seq_printf(s, "%s| Target Engine: %2d|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*43/*NV_RAMFC_TARGET*/) & 0x1f);
+		seq_printf(s, "%s| PDB LO: %#08x|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*128/*NV_RAMIN_PAGE_DIR_BASE_LO*/) & 0xfffff000);
+		seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(*(uint64_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*166/*NV_RAMIN_SC_PDB_VALID*/)));
+		// This appears to be unset on Xavier
+		//seq_printf(s, "%s| PAS ID: %8ld|\n", prefix, *(uint32_t*)phys_to_virt(instance_ptr + 4/*bytes for 32bits*/*135/*NV_RAMIN_PASID*/) & 0xfffff);
+	}
+	// END TEMP
+	seq_printf(s, "%s+---------------------+\n", prefix);
 	return 0;
 }
+#endif
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
 // Bug workaround. See comment in runlist_file_seq_start()
@@ -41,10 +62,14 @@ static loff_t pos_fixup;
 
 static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
 	static struct runlist_iter rl_iter;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
 	// *pos == 0 for first call after read of file
 	if (*pos == 0) {
-		int err = get_runlist_iter(&rl_iter);
+		int err = get_runlist_iter(g, seq2gpuidx(s), &rl_iter);
 		if (err)
+			return ERR_PTR(err);
+		// Don't try to print an empty runlist
+		if (rl_iter.rl_info.len <= 0)
 			return NULL;
 		return &rl_iter;
 	}
@@ -68,12 +93,13 @@ static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter,
 	loff_t *pos) {
 	struct runlist_iter* rl_iter = raw_rl_iter;
 	void *ret = NULL;
-	// Advance by one TSG + channels under last TSG
-	*pos += 1 + rl_iter->curr_tsg->tsg_length;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
+	// Advance by one TSG or channel
+	(*pos)++;
+	rl_iter->curr_entry += NV_RL_ENTRY_SIZE(g);
 	// Verify we haven't reached the end of the runlist
 	// rl_info.len is the num of tsg entries + total num of channel entries
 	if (*pos < rl_iter->rl_info.len) {
-		rl_iter->curr_tsg = next_tsg(rl_iter->curr_tsg);
 		ret = rl_iter;
 	}
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
@@ -88,57 +114,57 @@ static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) {
 }
 
 static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
-	struct entry_tsg* tsg = ((struct runlist_iter*)raw_rl_iter)->curr_tsg;
-	struct runlist_chan* chan;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
-	if (tsg->entry_type != ENTRY_TYPE_TSG) {
-		printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in tsg print logic!\n");
-		return -EIO;
-	}
-	seq_printf(s, "+---- TSG Entry %-2d----+\n", tsg->tsgid);
-	seq_printf(s, "| Scale: %-13d|\n", tsg->timeslice_scale);
-	seq_printf(s, "| Timeout: %-11d|\n", tsg->timeslice_timeout);
-	seq_printf(s, "+---------------------+\n");
-	for_chan_in_tsg(chan, tsg) {
+	struct runlist_iter *rl_iter = raw_rl_iter;
+	void *entry = rl_iter->curr_entry;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
+	if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
+		if (rl_iter->channels_left_in_tsg) {
+			printk(KERN_WARNING "[nvdebug] Found a TSG @ %px when %d channels were still expected under the previous TSG in the runlist!\n", entry, rl_iter->channels_left_in_tsg);
+			return -EIO;
+		}
+		rl_iter->channels_left_in_tsg = tsg_length(g, entry);
+		seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry));
+		seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry));
+		seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry));
+		seq_printf(s, "| Length: %-12d|\n", tsg_length(g, entry));
+		seq_printf(s, "+---------------------+\n");
+	} else {
+		char *indt = "";
 #ifndef DETAILED_CHANNEL_INFO
-		char* loc_txt;
-		u64 instance_ptr;
+		u64 instance_ptr = 0;
 #endif
-		if (chan->entry_type != ENTRY_TYPE_CHAN) {
-			printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in channel print logic!\n");
-			return -EIO;
+		if (rl_iter->channels_left_in_tsg) {
+			indt = " ";
+			rl_iter->channels_left_in_tsg--;
 		}
 #ifdef DETAILED_CHANNEL_INFO
-		runlist_detail_seq_show_chan(s, g, chan->chid);
+		runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
 #else
-		loc_txt = target_to_text(chan->inst_target);
-		if (!loc_txt) {
-			printk(KERN_WARNING "[nvdebug] Invalid apature in channel print logic!\n");
-			return -EIO;
-		}
 		// Reconstruct pointer to channel instance block
-		instance_ptr = chan->inst_ptr_hi;
-		instance_ptr <<= 32;
-		instance_ptr |= chan->inst_ptr_lo << 12;
-
-		seq_printf(s, " +- Channel Entry %-4d-+\n", chan->chid);
-		seq_printf(s, " | Runqueue Selector: %d|\n", chan->runqueue_selector);
-		seq_printf(s, " | Instance PTR: |\n");
-		seq_printf(s, " | %#018llx |\n", instance_ptr);
-		seq_printf(s, " | %-20s|\n", loc_txt);
-		seq_printf(s, " +---------------------+\n");
+		if (g->chip_id >= NV_CHIP_ID_VOLTA) {
+			instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi;
+			instance_ptr <<= 32;
+		}
+		instance_ptr |= inst_ptr_lo(g, entry) << 12;
+
+		seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry));
+		if (g->chip_id >= NV_CHIP_ID_VOLTA)
+			seq_printf(s, "%s| Runqueue Selector: %d|\n", indt,
+				((struct gv100_runlist_chan*)entry)->runqueue_selector);
+		seq_printf(s, "%s| Instance PTR: |\n", indt);
+		seq_printf(s, "%s| %#018llx |\n", indt, instance_ptr);
+		seq_printf(s, "%s| %-20s|\n", indt, target_to_text(inst_target(g, entry)));
+		seq_printf(s, "%s+---------------------+\n", indt);
 #endif
 	}
 	return 0;
 }
 
 static const struct seq_operations runlist_file_seq_ops = {
-  .start = runlist_file_seq_start,
-  .next = runlist_file_seq_next,
-  .stop = runlist_file_seq_stop,
-  .show = runlist_file_seq_show,
+	.start = runlist_file_seq_start,
+	.next = runlist_file_seq_next,
+	.stop = runlist_file_seq_stop,
+	.show = runlist_file_seq_show,
 };
 
 static int runlist_file_open(struct inode *inode, struct file *f) {
@@ -157,6 +183,7 @@ ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_tsgid;
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	if (err)
 		return err;
@@ -165,7 +192,7 @@
 		return -ERANGE;
 
 	// Execute preemption
-	err = preempt_tsg(target_tsgid);
+	err = preempt_tsg(g, target_tsgid);
 	if (err)
 		return err;
@@ -181,9 +208,9 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_channel;
 	channel_ctrl_t chan;
 	int err;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	runlist_info_t rl_info;
+	runlist_disable_t rl_disable;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
 	if (err)
@@ -195,7 +222,16 @@
 	// Disable channel
 	chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
 	chan.enable_clear = true;
+	// disable sched
+	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
+	rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
+	rl_disable.raw |= BIT(rl_info.id);
+	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
+	// disable chan
 	nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw);
+	// enable sched
+	rl_disable.raw &= ~BIT(rl_info.id);
+	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
 
 	return count;
 }
@@ -209,9 +245,7 @@ ssize_t enable_channel_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_channel;
 	channel_ctrl_t chan;
 	int err;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
 	if (err)
@@ -235,14 +269,12 @@ const struct file_operations enable_channel_file_ops = {
 ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 	size_t count, loff_t *off) {
 	uint32_t target_tsgid;
-	struct runlist_chan* chan;
+	struct gv100_runlist_chan* chan;
 	channel_ctrl_t chan_ctl;
 	struct runlist_iter rl_iter;
 	int err;
 	loff_t pos = 0;
-	struct gk20a *g = get_live_gk20a();
-	if (!g)
-		return -EIO;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
 	if (err)
@@ -251,32 +283,34 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 	if (target_tsgid > MAX_TSGID)
 		return -ERANGE;
 
-	err = get_runlist_iter(&rl_iter);
+	err = get_runlist_iter(g, 0, &rl_iter);
 	if (err)
 		return err;
 
 	// Iterate through all TSGs
 	while (pos < rl_iter.rl_info.len) {
-		if (rl_iter.curr_tsg->tsgid == target_tsgid) {
+		if (tsgid(g, rl_iter.curr_entry) == target_tsgid) {
 			// Enable channels of target TSG
-			for_chan_in_tsg(chan, rl_iter.curr_tsg) {
+			for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
 				chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
 				chan_ctl.enable_set = true;
 				nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
 			}
 		} else {
+			// XXX: Fix for bare channels. Maybe a "for_chan_until_tsg" macro?
 			// Disable all other channels
-			for_chan_in_tsg(chan, rl_iter.curr_tsg) {
+			// (This is how the Jetson nvgpu driver disables TSGs)
+			for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
 				chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan->chid));
 				chan_ctl.enable_clear = true;
 				nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chan->chid), chan_ctl.raw);
 			}
 		}
-		pos += 1 + rl_iter.curr_tsg->tsg_length;
-		rl_iter.curr_tsg = next_tsg(rl_iter.curr_tsg);
+		pos += 1 + tsg_length(g, rl_iter.curr_entry);
+		rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
 	}
 	// Switch to next TSG with active channels (should be our TSG)
-	err = preempt_tsg(target_tsgid);
+	err = preempt_tsg(g, target_tsgid);
 	if (err)
 		return err;
diff --git a/stubs.h b/stubs.h
new file mode 100644
index 0000000..bfcc0d7
--- /dev/null
+++ b/stubs.h
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpful private functions copied from elsewhere in the kernel tree
+ * DO NOT MODIFY
+ */
+#include <linux/pci.h>
+
+// Functions from drivers/pci/pci.h
+/**
+ * pci_match_one_device - Tell if a PCI device structure has a matching
+ *                        PCI device id structure
+ * @id: single PCI device id structure to match
+ * @dev: the PCI device structure to match against
+ *
+ * Returns the matching pci_device_id structure or %NULL if there is no match.
+ */
+static inline const struct pci_device_id *
+pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+	if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+	    (id->device == PCI_ANY_ID || id->device == dev->device) &&
+	    (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
+	    (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
+	    !((id->class ^ dev->class) & id->class_mask))
+		return id;
+	return NULL;
+}
+
+// Functions from drivers/pci/search.c
+#include <linux/device.h>
+#include <linux/version.h>
+extern struct bus_type pci_bus_type;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)
+static int match_pci_dev_by_id(struct device *dev, void *data)
+#else
+static int match_pci_dev_by_id(struct device *dev, const void *data)
+#endif
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	const struct pci_device_id *id = data;
+
+	if (pci_match_one_device(id, pdev))
+		return 1;
+	return 0;
+}
+
+/*
+ * pci_get_dev_by_id - begin or continue searching for a PCI device by id
+ * @id: pointer to struct pci_device_id to match for the device
+ * @from: Previous PCI device found in search, or %NULL for new search.
+ *
+ * Iterates through the list of known PCI devices. If a PCI device is found
+ * with a matching id a pointer to its device structure is returned, and the
+ * reference count to the device is incremented. Otherwise, %NULL is returned.
+ * A new search is initiated by passing %NULL as the @from argument. Otherwise
+ * if @from is not %NULL, searches continue from next device on the global
+ * list. The reference count for @from is always decremented if it is not
+ * %NULL.
+ *
+ * This is an internal function for use by the other search functions in
+ * this file.
+ */
+static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id,
+					 struct pci_dev *from)
+{
+	struct device *dev;
+	struct device *dev_start = NULL;
+	struct pci_dev *pdev = NULL;
+
+	if (from)
+		dev_start = &from->dev;
+	dev = bus_find_device(&pci_bus_type, dev_start, (void *)id,
+			      match_pci_dev_by_id);
+	if (dev)
+		pdev = to_pci_dev(dev);
+	pci_dev_put(from);
+	return pdev;
+}
-- 
cgit v1.2.2
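Post-script for review (not part of the patch): below is a minimal userspace sketch of how the new per-GPU ProcFS interface is intended to be exercised. It assumes a single GPU enumerated as gpu0 and uses a placeholder TSG ID of 1; real TSG IDs can be read from /proc/gpu0/runlist0. num_gpcs is backed by nvdebug_reg32_read() and returns the raw register value in hex; preempt_tsg parses the written string with kstrtou32_from_user().

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[16] = {0};
	// Read a world-readable register file (hex-formatted, e.g. "0x6\n")
	int fd = open("/proc/gpu0/num_gpcs", O_RDONLY);
	if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
		perror("num_gpcs");
		return 1;
	}
	printf("num_gpcs: %s", buf);
	close(fd);
	// Ask nvdebug to preempt TSG 1 (placeholder ID) off the GPU
	fd = open("/proc/gpu0/preempt_tsg", O_WRONLY);
	if (fd < 0 || write(fd, "1", strlen("1")) < 0) {
		perror("preempt_tsg");
		return 1;
	}
	close(fd);
	return 0;
}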