author    Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:01 -0400
committer Joshua Bakita <bakitajoshua@gmail.com>    2025-05-05 03:53:13 -0400
commit    293430fcb5d4013b573556c58457ee706e482b7f (patch)
tree      9328fa680f55b4e1a08d24714275b8437be3be5d
parent    494df296bf4abe9b2b484bde1a4fad28c989afec (diff)
Snapshot for ECRTS'25 artifact evaluation
-rw-r--r--  Makefile               3
-rw-r--r--  README.md             10
-rw-r--r--  device_info_procfs.c  79
-rw-r--r--  mmu.c                414
-rw-r--r--  nvdebug.h            293
-rw-r--r--  nvdebug_entry.c      476
-rw-r--r--  nvdebug_linux.h        5
-rw-r--r--  runlist.c            275
-rw-r--r--  runlist_procfs.c     645
9 files changed, 2154 insertions, 46 deletions
diff --git a/Makefile b/Makefile
index fea3819..9d6d374 100644
--- a/Makefile
+++ b/Makefile
@@ -8,3 +8,6 @@ all:
8 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules 8 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
9clean: 9clean:
10 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean 10 make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
11
12nvdebug_user.so: runlist.c mmu.c bus.c nvdebug_user.c
13	gcc $< -shared -o $@ $(KBUILD_CFLAGS)
diff --git a/README.md b/README.md
index da3e5d7..2889b29 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ Not all these TPCs will necessarially be enabled in every GPC.
59Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X. 59Use `cat gpcX_tpc_mask` to get a bit mask of which TPCs are disabled for GPC X.
60A set bit indicates a disabled TPC. 60A set bit indicates a disabled TPC.
61This API is only available on enabled GPCs. 61This API is only available on enabled GPCs.
62Bits greater than the number of on-chip TPCs per GPC should be ignored (it may appear that non-existent TPCs are "disabled").
62 63
63Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC). 64Example usage: To get the number of on-chip SMs on Volta+ GPUs, multiply the return of `cat num_gpcs` with `cat num_tpc_per_gpc` and multiply by 2 (SMs per TPC).
64 65
@@ -83,6 +84,13 @@ Use `echo Z > runlistY/switch_to_tsg` to switch the GPU to run only the specifie
83 84
84Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels). 85Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GPUs to pick up on re-enabled channels).
85 86
87## Error Interpretation
88First check the kernel log to see if it includes more information about the error.
89The following conventions are used for certain error codes:
90
91- EIO, "Input/Output Error," is returned when an operation fails due to a bad register read.
92- (Other errors may not have a consistent conventional meaning; see the implementation.)
93
86## General Codebase Structure 94## General Codebase Structure
87- `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers. 95- `nvdebug.h` defines and describes all GPU data structures. This does not depend on any kernel-internal headers.
88- `nvdebug_entry.h` contains module startup, device detection, initialization, and module teardown logic. 96- `nvdebug_entry.h` contains module startup, device detection, initialization, and module teardown logic.
@@ -94,4 +102,4 @@ Use `echo Y > resubmit_runlist` to resubmit runlist Y (useful to prompt newer GP
94 102
95- The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs). 103- The runlist-printing API does not work when runlist management is delegated to the GPU System Processor (GSP) (most Turing+ datacenter GPUs).
96 To workaround, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting. 104 To workaround, enable the `FALLBACK_TO_PRAMIN` define in `runlist.c`, or reload the `nvidia` kernel module with the `NVreg_EnableGpuFirmware=0` parameter setting.
97 (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.) 105 (Eg. on A100: end all GPU-using processes, then `sudo rmmod nvidia_drm nvidia_modeset nvidia_uvm nvidia; sudo modprobe nvidia NVreg_EnableGpuFirmware=0`.)
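The SM-count recipe in the README hunk above can also be scripted. A minimal userspace sketch, assuming the module's procfs files are exposed under /proc/gpu0/ (adjust the path for your setup); this helper is illustrative only and not part of the module:

#include <stdio.h>

// Read a single integer (decimal or 0x-prefixed hex) from a procfs file
static int read_proc_int(const char *path) {
	FILE *f = fopen(path, "r");
	int val = -1;
	if (!f)
		return -1;
	if (fscanf(f, "%i", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void) {
	int gpcs = read_proc_int("/proc/gpu0/num_gpcs");
	int tpc_per_gpc = read_proc_int("/proc/gpu0/num_tpc_per_gpc");
	if (gpcs < 0 || tpc_per_gpc < 0)
		return 1;
	// Volta+ GPUs have 2 SMs per TPC (see the README text above)
	printf("On-chip SMs: %d\n", gpcs * tpc_per_gpc * 2);
	return 0;
}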
diff --git a/device_info_procfs.c b/device_info_procfs.c
index 4e4ab03..105e731 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -18,7 +18,7 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size,
18 return 0; 18 return 0;
19 19
20 if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1) 20 if ((read = nvdebug_readl(g, (uintptr_t)pde_data(file_inode(f)))) == -1)
21 return -EOPNOTSUPP; 21 return -EIO;
22 // 32 bit register will always take less than 16 characters to print 22 // 32 bit register will always take less than 16 characters to print
23 chars_written = scnprintf(out, 16, "%#0x\n", read); 23 chars_written = scnprintf(out, 16, "%#0x\n", read);
24 if (copy_to_user(buf, out, chars_written)) 24 if (copy_to_user(buf, out, chars_written))
@@ -32,12 +32,85 @@ struct file_operations nvdebug_read_reg32_file_ops = {
32 .llseek = default_llseek, 32 .llseek = default_llseek,
33}; 33};
34 34
35typedef union {
36 struct {
37 uint8_t partitioning_select:2;
38 uint8_t table_select:2;
39 uint32_t pad_1:12;
40 uint8_t veid_offset:6;
41 uint32_t pad_2:2;
42 uint8_t table_offset:6;
43 uint32_t pad_3:2;
44 };
45 uint32_t raw;
46} partition_ctl_t;
47
48static ssize_t nvdebug_read_part(struct file *f, char __user *buf, size_t size, loff_t *off) {
49 char out[12*64+2];
50 int i, chars_written = 0;
51 partition_ctl_t part_ctl;
52 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
 53	if (size < sizeof(out) || *off != 0)
 54		return 0;
 55	// Each of the 64 words printed below takes at most 11 characters
56 part_ctl.raw = nvdebug_readl(g, 0x00405b2c);
57 //part_ctl.partitioning_select = 0; // XXX XXX XXX Temp; 06/18/2024
58 //part_ctl.table_select = 3; // 3 == ???
59 //part_ctl.table_select = 2; // 2 == TBL_SEL_PARTITIONING_LMEM_BLK
60 part_ctl.table_select = 1; // 1 == TBL_SEL_PARTITIONING_ENABLE
61 //part_ctl.table_select = 0; // 0 == TBL_SEL_NONE
62 part_ctl.veid_offset = (uintptr_t)pde_data(file_inode(f)); // Range of [0, 0x3f], aka [0, 63]
63 for (i = 0; i < 64; i++) {
64 // Increment to next table offset in PARTITION_CTL
65 part_ctl.table_offset = i;
66 nvdebug_writel(g, 0x00405b2c, part_ctl.raw);
67 // Verify write applied to PARTITION_CTL
68 part_ctl.raw = nvdebug_readl(g, 0x00405b2c);
69 if (part_ctl.table_offset != i)
70 return -ENOTRECOVERABLE;
71 // Read PARTITION_DATA and print
72 // ---
73 // I get back 0x000000ff on Volta and 0x00000003 on Turing from
74 // PARTITION_DATA for all possible VEID_OFFSET, TBL_OFFSET, and TBL_SEL
75 // combinations.
76 // ---
77 // There's a 48-byte (12-word) gap after the address for PARTITION_DATA.
78 // Exploring this on Turing for TBL_SEL_PARTITIONING_ENABLE, VEID 1, 62, and
79 // 63, with CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=5 for constant_cycles_kernel
80 // running under MPS:
81 // +0x0: 0x3
82 // +0x4: 0
83 // +0x8: 0x100
84 // +0xC: 0
85 // +0x10: 0xffffffff
86 // +0x14: 0
87 // +0x18: 0
88 // +0x1C: 0xffffffff
89 // +0x20: 0
90 // +0x24: 0xffffffff
91 // +0x28: 0xffffffff
92 // +0x2C: 0xffffffff
93 chars_written += scnprintf(out + chars_written, 12, "%#010x ", nvdebug_readl(g, 0x00405b30));
94 }
95 chars_written += scnprintf(out + chars_written, 2, "\n");
96 if (copy_to_user(buf, out, chars_written))
97 printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
98 *off += chars_written;
99 return chars_written;
100}
101
102struct file_operations nvdebug_read_part_file_ops = {
103 .read = nvdebug_read_part,
104 .llseek = default_llseek,
105};
106
35static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { 107static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
36 char out[12]; 108 char out[12];
37 int chars_written; 109 int chars_written;
38 uint32_t read, mask; 110 uint32_t read, mask;
39 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; 111 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
40 // See comment in nvdebug_entry.c to understand `union reg_range` 112 // `start_bit` is included, `stop_bit` is not, so to print lower eight bits
113 // from a register, use `start_bit = 0` and `stop_bit = 8`.
41 union reg_range range; 114 union reg_range range;
42 range.raw = (uintptr_t)pde_data(file_inode(f)); 115 range.raw = (uintptr_t)pde_data(file_inode(f));
43 116
@@ -47,7 +120,7 @@ static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t s
47 120
48 // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` 121 // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset`
49 if ((read = nvdebug_readl(g, range.offset)) == -1) 122 if ((read = nvdebug_readl(g, range.offset)) == -1)
50 return -EOPNOTSUPP; 123 return -EIO;
51 // Setup `mask` used to throw out unused upper bits 124 // Setup `mask` used to throw out unused upper bits
52 mask = -1u >> (32 - range.stop_bit + range.start_bit); 125 mask = -1u >> (32 - range.stop_bit + range.start_bit);
53 // Throw out unused lower bits via a shift, apply the mask, and print 126 // Throw out unused lower bits via a shift, apply the mask, and print
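To make the start_bit/stop_bit convention above concrete, the following standalone helper (not part of the module) mirrors the mask-and-shift logic: with start_bit = 0 and stop_bit = 8, the mask is -1u >> 24 = 0xff, i.e., the low byte of the register.

#include <stdint.h>

// Mirrors nvdebug_reg_range_read()'s extraction: start_bit is included,
// stop_bit is excluded.
static uint32_t extract_bits(uint32_t reg, unsigned int start_bit, unsigned int stop_bit) {
	uint32_t mask = -1u >> (32 - stop_bit + start_bit);
	return (reg >> start_bit) & mask;
}
// extract_bits(0x12345678, 0, 8) == 0x78; extract_bits(0x12345678, 8, 16) == 0x56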
diff --git a/mmu.c b/mmu.c
index ababef5..e2b9a91 100644
--- a/mmu.c
+++ b/mmu.c
@@ -1,9 +1,13 @@
1/* Copyright 2024 Joshua Bakita 1/* Copyright 2024 Joshua Bakita
2 * Helpers to deal with NVIDIA's MMU and associated page tables 2 * Helpers to deal with NVIDIA's MMU and associated page tables
3 */ 3 */
4#include <linux/dma-mapping.h> // dma_map_page() and dma_unmap_page()
4#include <linux/err.h> // ERR_PTR() etc. 5#include <linux/err.h> // ERR_PTR() etc.
6#include <linux/gfp.h> // alloc_pages()
5#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys() 7#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
6#include <linux/kernel.h> // Kernel types 8#include <linux/kernel.h> // Kernel types
9#include <linux/list.h> // struct list_head and associated functions
10#include <linux/mm.h> // put_page()
7 11
8#include "nvdebug.h" 12#include "nvdebug.h"
9 13
@@ -15,6 +19,11 @@ int g_verbose = 0;
15#define printk_debug if (g_verbose >= 2) printk 19#define printk_debug if (g_verbose >= 2) printk
16#define printk_info if (g_verbose >= 1) printk 20#define printk_info if (g_verbose >= 1) printk
17 21
22// At least map_page_directory() assumes that pages are 4 KiB
23#if PAGE_SIZE != 4096
24#error nvdebug assumes and requires a 4 KiB page size.
25#endif
26
18/* Convert a page directory (PD) pointer and aperture to be kernel-accessible 27/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
19 28
20 I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the 29 I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
@@ -22,7 +31,8 @@ int g_verbose = 0;
22 31
23 @param addr Pointer from page directory entry (PDE) 32 @param addr Pointer from page directory entry (PDE)
24 @param pd_ap PD-type aperture (target address space) for `addr` 33 @param pd_ap PD-type aperture (target address space) for `addr`
25 @return A dereferencable kernel address, or an ERR_PTR-wrapped error 34 @return A dereferencable kernel address, 0 if an I/O MMU is in use and has
35 no available mapping for the bus address, or an ERR_PTR-wrapped error
26 */ 36 */
27static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, 37static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr,
28 enum PD_TARGET pd_ap) { 38 enum PD_TARGET pd_ap) {
@@ -56,7 +66,7 @@ static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr,
56 // Check for, and translate through, the I/O MMU (if any) 66 // Check for, and translate through, the I/O MMU (if any)
57 if ((dom = iommu_get_domain_for_dev(g->dev))) { 67 if ((dom = iommu_get_domain_for_dev(g->dev))) {
58 phys = iommu_iova_to_phys(dom, addr); 68 phys = iommu_iova_to_phys(dom, addr);
59 printk_debug(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", addr, phys); 69 printk_debug(KERN_DEBUG "[nvdebug] %s: I/O MMU translated SYS_MEM I/O VA %#lx to physical address %#llx.\n", __func__, addr, phys);
60 } else 70 } else
61 phys = addr; 71 phys = addr;
62 72
@@ -143,6 +153,327 @@ uint64_t search_page_directory(struct nvdebug_state *g,
143 return 0; 153 return 0;
144} 154}
145 155
156/* GPU Virtual address -> Physical address ("forward" translation) for V2 tables
157 Index the page directories and tables used by the GPU MMU to determine which
158 physical address a given GPU virtual address has been mapped to.
159
160 The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
161 across multiple apertures.
162
163 @param pd_config Page Directory configuration, containing pointer and
164 aperture for the start of the PDE3 entries
165 @param addr_to_find Virtual address to translate to a physical address
166 @param found_addr Where to store found physical address (0 if unfound)
167 @param found_aperture Where to store aperture of found physical address
168 @return 0 on success, -ENXIO if not found, and -errno on error.
169*/
170int translate_page_directory(struct nvdebug_state *g,
171 page_dir_config_t pd_config,
172 uint64_t addr_to_find,
173 uint64_t *found_addr /* out */,
174 enum INST_TARGET *found_aperture /* out */) {
175 page_dir_entry_t entry;
176 void __iomem *next_kva;
177 unsigned int level, pde_idx;
178 uintptr_t next = (uintptr_t)pd_config.page_dir << 12;
179 enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target);
180
181 *found_addr = 0;
182 *found_aperture = TARGET_INVALID;
183
184 // Make sure that the query is page-aligned (likely mistake otherwise)
185 if (addr_to_find & 0xfff) {
186 printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_page_directory()!\n", addr_to_find);
187 return -EINVAL;
188 }
189
190 printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V2 page table with base %#018llx\n", (u64)addr_to_find, (u64)next);
191
192 // Step through each PDE level and the PTE level
193 for (level = 0; level < 5; level++) {
194 // Index into this level
195 pde_idx = (addr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
196 printk_debug(KERN_DEBUG "[nvdebug] Using index %u in lvl %d\n", pde_idx, level);
197 // Hack to workaround PDE0 being double-size and strangely formatted
198 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
199 next += 8;
200 // Obtain a kernel-dereferencable address
201 next_kva = pd_deref(g, next, next_target);
202 if (IS_ERR_OR_NULL(next_kva)) {
203 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva));
204 return PTR_ERR(next_kva);
205 }
206 // Obtain entry at this level
207 entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
208 if (entry.target == PD_AND_TARGET_INVALID)
209 return -ENXIO;
210 printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %#018llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
211 // Just return the physical address if this is the PTE level
212 if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB
213 *found_addr = ((uint64_t)entry.addr) << 12;
214 *found_aperture = entry.aperture;
215 return 0;
216 }
217 // Otherwise step to the next table level
218 // TODO: Use addr_w as appropriate
219 next = (uint64_t)entry.addr << 12;
220 next_target = entry.target;
221 }
222
223 return 0;
224}
225
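A call-site sketch (not part of this commit) of how the forward translation above might be used together with the BAR2/BAR3 page table from get_bar2_pdb(); the helper name is hypothetical.

// Hypothetical helper: report where page-aligned GPU virtual address `vaddr`
// of the BAR2/BAR3 address space is currently mapped.
static int example_translate_bar2(struct nvdebug_state *g, uint64_t vaddr) {
	page_dir_config_t pd_config;
	uint64_t paddr;
	enum INST_TARGET aperture;
	int err;
	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	if (pd_config.is_ver2)
		err = translate_page_directory(g, pd_config, vaddr, &paddr, &aperture);
	else
		err = translate_v1_page_directory(g, pd_config, vaddr, &paddr, &aperture);
	if (err < 0)
		return err;
	printk(KERN_INFO "[nvdebug] %#018llx -> %#018llx (%s)\n", vaddr, paddr,
	       target_to_text(aperture));
	return 0;
}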
226// This struct is very special. We will never directly allocate this struct;
227// its sole purpose is to provide more intuitive names to the offsets at which
228// we store data in Linux's struct page. Such (ab)use of struct page is
 229// explicitly permitted (see linux/mm_types.h). This struct is thus used by
230// casting a pointer of struct page to a pointer of struct nvdebug_pd_page,
231// then accessing the associated fields. This pointer may also be freely cast
 232// back to a struct page pointer.
233// We have 24 (32-bit) or 44 (64-bit) bytes available in the page struct
234// (according to the documentation on struct page). Our comments indicate what
235// available parts of struct page we repurpose for our own needs.
236struct nvdebug_pd_page {
237 unsigned long __flags; // From struct page; do not touch!
238 // Overlaps struct page.lru
239 struct list_head list; // 4/8 bytes
240 // Overlaps struct page.mapping (and page.share on 32-bit)
241 uintptr_t parent_addr; // 8 bytes
242 // Overlaps struct page.share (page.private on 32-bit)
243 enum PD_TARGET parent_aperture; // 4 bytes
244 // Overlaps page.private (page.page_type on 32-bit)
245 dma_addr_t dma_addr; // 4/8 bytes
246};
247
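One guard that could be added (it is not in this commit) to catch this struct outgrowing the space it borrows; note it only checks total size, not the exact field overlap described above.

// Hypothetical compile-time check: struct nvdebug_pd_page must never be larger
// than struct page, whose storage it reinterprets.
static inline void nvdebug_pd_page_layout_check(void) {
	BUILD_BUG_ON(sizeof(struct nvdebug_pd_page) > sizeof(struct page));
}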
248/* Collect and free any now-unused page directory/table allocations
249
250 @param force Deallocate all page directories/tables created by this module,
251 no matter if they appear to be in-use or not.
252 @returns Number of freed pages on success, -errno on error.
253*/
254int gc_page_directory(struct nvdebug_state *g, bool force) {
255 struct nvdebug_pd_page *page, *_page;
256 void __iomem *parent_kva;
257 page_dir_entry_t parent_entry;
258 int freed_pages = 0;
259
260 // Depth-first traversal (from perspective of each page table) of page
261 // allocations.
262 // (This is depth-first because map_page_directory() always allocates and
263 // pushes page directory allocations before page table allocations.)
264 list_for_each_entry_safe_reverse(page, _page, &g->pd_allocs, list) {
265 printk_debug(KERN_DEBUG "[nvdebug] %s: Checking if page directory/table at %llx (SYS_MEM_?) with parent at %lx (%s) is unused...\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture));
266 // Try to determine if we're still in-use. We consider ourselves
267 // potentially in-use if our parent still points to us.
268 parent_kva = pd_deref(g, page->parent_addr, page->parent_aperture);
269 if (IS_ERR(parent_kva)) {
270 printk(KERN_ERR "[nvdebug] %s: Error resolving %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, page->parent_addr, pd_target_to_text(page->parent_aperture), PTR_ERR(parent_kva));
271 return -ENOTRECOVERABLE;
272 }
273 // A NULL kva indicates parent no longer exists
274 parent_entry.raw_w = parent_kva ? readq(parent_kva) : 0;
275 // Page directory/table still in-use; do not free unless forced
276 if (parent_entry.addr_w == (page->dma_addr >> 12) && !force)
277 continue;
278 // Free this page table/directory and delete our parent's pointer to us
279 if (parent_entry.addr_w == (page->dma_addr >> 12)) {
280 printk(KERN_WARNING "[nvdebug] %s: Deleting page table/directory at %llx (SYS_MEM_?) with parent at %lx (%s) that may still be in-use!\n", __func__, page->dma_addr, page->parent_addr, pd_target_to_text(page->parent_aperture));
281 writeq(0, parent_kva);
282 }
283 // Unmap, zero, free, and remove from tracking (these all return void)
284 dma_unmap_page(g->dev, page->dma_addr, PAGE_SIZE, DMA_TO_DEVICE);
285 memset(page_to_virt((struct page*)page), 0, PAGE_SIZE);
286 // Necessary to reset mapcount as we (ab)use its state for other things
287 page_mapcount_reset((struct page*)page);
288 // Same reset needed for mapping
289 ((struct page*)page)->mapping = NULL;
290 // Remove this page from our list of allocated pages
291 list_del(&page->list);
292 // Free the page
293 put_page((struct page*)page);
294 freed_pages++;
295 }
296 printk_debug(KERN_DEBUG "[nvdebug] %s: Freed %d pages.", __func__, freed_pages);
297 return freed_pages;
298}
299
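A call-site sketch (not part of this commit) of how the collector above might be invoked from a teardown path:

// Hypothetical cleanup helper: force-free every page directory/table this
// module allocated, e.g., immediately before module unload.
static void example_cleanup_pd_allocs(struct nvdebug_state *g) {
	int freed = gc_page_directory(g, true);
	if (freed < 0)
		printk(KERN_ERR "[nvdebug] Unable to free page directory/table allocations: %d\n", freed);
	else
		printk(KERN_INFO "[nvdebug] Freed %d page directory/table pages\n", freed);
}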
300/* Map a GPU virtual address to a physical address in a GPU page table
301 Search for a mapping for specified GPU virtual address, and create a new one
302 if none is found. Automatically creates page directories and page table
303 entries as necessary.
304
305 The page directory and tables may be located in VID_MEM, SYS_MEM, or spread
306 across multiple apertures.
307
308 @param pd_config Page Directory configuration, containing pointer and
309 aperture for the start of the PDE3 entries
310 @param vaddr_to_find Virtual address to check, and map to a physical address
311 if nothing is already mapped (up to 49 bits long)
312 @param paddr_to_map Physical address to use (up to 36 bits long if VID_MEM,
313 and up to 58 bits if SYS_MEM)
314 @param paddr_target Which space does the physical address refer to?
315 @param huge_page Set to map a 2 MiB, rather than 4 KiB, page
316 @return 0 on success, 1 if mapping already exists, -EADDRINUSE if virtual
317 address is already mapped to something else, and -errno on error
318*/
319int map_page_directory(struct nvdebug_state *g,
320 page_dir_config_t pd_config,
321 uint64_t vaddr_to_find,
322 uint64_t paddr_to_map,
323 enum INST_TARGET paddr_target,
324 bool huge_page) {
325 page_dir_entry_t entry;
326 void __iomem *next_kva;
327 unsigned int level, pde_idx;
328 uintptr_t next = (uintptr_t)pd_config.page_dir << 12;
329 enum PD_TARGET next_target = INST2PD_TARGET(pd_config.target);
330
331 // Make sure that the query is page-aligned (likely mistake otherwise)
332 if ((vaddr_to_find & 0xfff || paddr_to_map & 0xfff)
333 || (huge_page && (vaddr_to_find & 0x1fffff || paddr_to_map & 0x1fffff))) {
334 printk(KERN_WARNING "[nvdebug] %s: Attempting to map an unaligned address (physical %#018llx or virtual %#018llx)! Failing...\n", __func__, paddr_to_map, vaddr_to_find);
335 return -EINVAL;
336 }
337
 338	// NVIDIA supports up to 49-bit virtual addresses
339 // Except Jetson Xavier only seems to be able to resolve 47-bit addresses?
340 if (vaddr_to_find >> 49) {
341 printk(KERN_WARNING "[nvdebug] %s: vaddr_to_find (%#018llx) is beyond the 49-bit virtual address space supported by the GPU! Failing...\n", __func__, vaddr_to_find);
342 return -EINVAL;
343 }
344
345 // NVIDIA supports up to 36-bit VID_MEM addresses
346 if (paddr_target == TARGET_VID_MEM && paddr_to_map >> 36) {
347 printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 36-bit VID_MEM address space! Failing...\n", __func__, paddr_to_map);
348 return -EINVAL;
349 }
350
351 // NVIDIA supports up to 58-bit SYS_MEM addresses
352 if ((paddr_target == TARGET_SYS_MEM_COHERENT ||
353 paddr_target == TARGET_SYS_MEM_NONCOHERENT) && paddr_to_map >> 58) {
354 printk(KERN_WARNING "[nvdebug] %s: paddr_to_map (%#018llx) is beyond the 58-bit SYS_MEM address space! Failing...\n", __func__, paddr_to_map);
355 return -EINVAL;
356 }
357
358 // We don't support mapping to PEERs; that requires a PEER ID
359 if (paddr_target == TARGET_PEER) {
360 printk(KERN_WARNING "[nvdebug] %s: paddr_target must be SYS_MEM_* or VID_MEM! Failing...\n", __func__);
361 return -EINVAL;
362 }
363
364 printk_info(KERN_INFO "[nvdebug] Mapping addr %#018llx in page table with base %#018llx to %s address %#018llx\n", vaddr_to_find, (u64)next, target_to_text(paddr_target), paddr_to_map);
365
366 // Step through each PDE level and the PTE level
367 for (level = 0; level < 5; level++) {
368 // Index into this level
369 pde_idx = (vaddr_to_find >> NV_MMU_PT_V2_LSB[level]) & (NV_MMU_PT_V2_SZ[level] - 1);
370 printk_debug(KERN_DEBUG "[nvdebug] In table at KVA %#lx, using index %u in lvl %d\n", (uintptr_t)next, pde_idx, level);
371 // Hack to workaround PDE0 being double-size and strangely formatted
372 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
373 next += 8;
374 // Obtain a kernel-dereferencable address
375 next_kva = pd_deref(g, next, next_target);
376 if (IS_ERR_OR_NULL(next_kva)) {
377 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, next, pd_target_to_text(next_target), PTR_ERR(next_kva));
378 return -ENOTRECOVERABLE;
379 }
380 // Obtain entry at this level
381 entry.raw_w = readq(next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
382 // If pointer to next level of the table does not exist
383 if (entry.target == PD_AND_TARGET_INVALID) { // PTE or PD covered by PD_AND_TARGET_INVALID
384 if (level == 4 || (huge_page && level == 3)) {
385 // Create new PTE (allocation, as needed, is handled at level 2 or 3)
386 // Targets observed in page tables:
387 // For PCIe: entry.target == PTE_AND_TARGET_VID_MEM;
388 // For Jetson: entry.target == PTE_AND_TARGET_SYS_MEM_NONCOHERENT;
389 entry.is_pte = 1;
390 entry.aperture = paddr_target;
391 if (paddr_target == TARGET_VID_MEM)
392 entry.addr = paddr_to_map >> 12;
393 else
394 entry.addr_w = paddr_to_map >> 12;
395 // Set the volatile bit (as NVRM does for SYS_MEM_COHERENT mappings)
396 // (This does nothing if the target is VID_MEM, but if the target is
397 // SYS_MEM_*, accesses will bypass the L2.)
398 entry.is_volatile = 1;
399 // Leave other fields zero, yielding an unencrypted, unprivileged, r/w,
400 // volatile mapping with atomics enabled.
401
402 // XXX: Hack to work around PDE0 double-size weirdness. Huge
403 // page mapping will fault without this.
404 if (level == 3)
405 writeq(entry.raw_w, next_kva - 8 + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
406 } else {
407 struct page* page_dir;
408 struct nvdebug_pd_page* page_dir_reinterpret;
409 dma_addr_t page_dir_dma;
410 // Allocate one 4 KiB all-zero (all invalid) page directory/
411 // table at the next level
412 if (!(page_dir = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0)))
413 return -ENOMEM;
414 // Obtain a GPU-accessible/bus address for this page (handling
415 // I/O MMU mappings, etc.)
416 page_dir_dma = dma_map_page(g->dev, page_dir, 0, PAGE_SIZE, DMA_TO_DEVICE);
417 // Verify that we were able to create a mapping
418 if (dma_mapping_error(g->dev, page_dir_dma))
419 return dma_mapping_error(g->dev, page_dir_dma);
420 // Record this allocation for freeing later
421 // Note: Linux maintains a page struct for every page in the
422 // system. This struct has available space that drivers
423 // can use to store their own tracking information. Our
424 // struct nvdebug_pd_page facilitates this.
425 page_dir_reinterpret = (struct nvdebug_pd_page*)page_dir;
426 page_dir_reinterpret->parent_addr = next + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx;
427 page_dir_reinterpret->parent_aperture = next_target;
428 page_dir_reinterpret->dma_addr = page_dir_dma;
429 list_add(&page_dir_reinterpret->list, &g->pd_allocs);
430 // Point this entry to the new directory/table
431 entry.target = PD_AND_TARGET_SYS_MEM_COHERENT; // Observed in page tables
432 // Must use addr_w with SYS_MEM targets
433 entry.addr_w = page_dir_dma >> 12;
434 // On Jetson and NVRM, all PDEs are marked volatile
435 entry.is_volatile = 1;
436 // We don't configure ATS, so disable ATS lookups for speed.
437 entry.no_ats = 1;
438 }
439 writeq(entry.raw_w, next_kva + NV_MMU_PT_V2_ENTRY_SZ[level] * pde_idx);
440 printk_debug(KERN_DEBUG "[nvdebug] Created %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
441 // Successfully created the requested PTE, so return
442 if (entry.is_pte)
443 return 0;
444 } else {
445 printk_debug(KERN_DEBUG "[nvdebug] Found %s pointing to %llx in ap '%s' at lvl %d (raw: %#018llx)\n", entry.is_pte ? "PTE" : "PDE", ((u64)entry.addr) << 12, pd_target_to_text(entry.target), level, entry.raw_w);
446 }
447
448 // If this is the PTE level, return success if the address and target are correct
449 if (entry.is_pte) { // level == 4 for 4 KiB pages, == 3 for 2 MiB
450 if (entry.aperture != paddr_target)
451 return -EADDRINUSE; // Also handles PEER
452 if (entry.aperture == TARGET_VID_MEM)
453 return (uint64_t)entry.addr == paddr_to_map >> 12 ? 1 : -EADDRINUSE;
454 else
455 return entry.addr_w == paddr_to_map >> 12 ? 1 : -EADDRINUSE; // SYS_MEM is wider
456 }
457
458 // If mapping a 2 MiB page and we made it here, level 3 had a PDE. This
459 // means that the requested 2 MiB virtual region already has one or more
460 // small pages mapped within it---a.k.a., the addresses are in use.
461 // If we didn't bail out here, the above logic would attempt to fallback
462 // to a 4 KiB mapping, which would be unexpected behavior.
463 if (huge_page && level == 3)
464 return -EADDRINUSE;
465
466 // Otherwise step to the next table level
467 if (entry.aperture == TARGET_VID_MEM)
468 next = (uint64_t)entry.addr << 12;
469 else
470 next = (uint64_t)entry.addr_w << 12; // SYS_MEM is wider
471 next_target = entry.target;
472 }
473
474 return -ENOTRECOVERABLE; // Should be impossible
475}
476
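A call-site sketch (not part of this commit) tying the pieces above together: create a mapping, then confirm it with the forward translation. The helper and its arguments are hypothetical.

// Hypothetical helper: map the DMA-mapped system-memory page at `dma_addr` to
// GPU virtual address `vaddr` (both 4 KiB-aligned) in the BAR2/BAR3 page
// table, then verify the mapping via translate_page_directory().
static int example_map_and_check(struct nvdebug_state *g, uint64_t vaddr,
                                 dma_addr_t dma_addr) {
	page_dir_config_t pd_config;
	uint64_t check_addr;
	enum INST_TARGET check_ap;
	int err;
	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	// 0 == new mapping created; 1 == identical mapping already present
	err = map_page_directory(g, pd_config, vaddr, dma_addr,
	                         TARGET_SYS_MEM_COHERENT, false);
	if (err < 0)
		return err;
	if ((err = translate_page_directory(g, pd_config, vaddr, &check_addr, &check_ap)) < 0)
		return err;
	// A thorough check would also confirm check_ap is a SYS_MEM target
	return check_addr == (uint64_t)dma_addr ? 0 : -EIO;
}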
146/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables 477/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
147 (See `search_page_directory()` for documentation.) 478 (See `search_page_directory()` for documentation.)
148 */ 479 */
@@ -187,7 +518,7 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
187 // Verify PDE is present 518 // Verify PDE is present
188 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) 519 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
189 continue; 520 continue;
190// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); 521 // TODO: Handle huge pages
191 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw); 522 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE at index %lld pointing to PTEs @ %#018llx in ap '%d' (raw: %#018llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", i, ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
192 // For each PTE 523 // For each PTE
193 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { 524 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
@@ -215,7 +546,84 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
215 return 0; 546 return 0;
216} 547}
217 548
549/* GPU Virtual address -> Physical address ("forward" translation) for V1 tables
550 (See `translate_page_directory()` for documentation.)
551*/
552int translate_v1_page_directory(struct nvdebug_state *g,
553 page_dir_config_t pd_config,
554 uint64_t addr_to_find,
555 uint64_t *found_addr /* out */,
556 enum INST_TARGET *found_aperture /* out */) {
557 page_dir_entry_v1_t pde;
558 page_tbl_entry_v1_t pte;
559 uintptr_t pde_idx, pde_phys, pte_idx, pte_phys;
560 void __iomem *pte_kva, *pde_kva;
561
562 *found_addr = 0;
563 *found_aperture = TARGET_INVALID;
564
565 // Make sure that the query is page-aligned (likely mistake otherwise)
566 if (addr_to_find & 0xfff) {
567 printk(KERN_WARNING "[nvdebug] Attempting to translate unaligned address %#llx in translate_v1_page_directory()!\n", addr_to_find);
568 return -EINVAL;
569 }
570
571 // This function only understands the Page Table Version 1 format
572 if (pd_config.is_ver2) {
573 printk(KERN_ERR "[nvdebug] Passed a Version 2 page table at %#018llx to translate_v1_page_directory()!\n", (uint64_t)pd_config.page_dir << 12);
574 return -EINVAL;
575 }
576
577 // We only understand the Version 1 format when 128 KiB huge pages are in-use
578 if (pd_config.is_64k_big_page) {
579 printk(KERN_ERR "[nvdebug] Page Table Version 1 with 64 KiB huge pages is unsupported!\n");
580 return -EINVAL;
581 }
582
583 printk_info(KERN_INFO "[nvdebug] Translating addr %#018llx in V1 page table with base %#018llx\n", (uint64_t)addr_to_find, (uint64_t)pd_config.page_dir << 12);
584
585 // Shift bits which define PDE index to start at bit 0, and mask other bits
586 pde_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[0]) & (NV_MMU_PT_V1_SZ[0] - 1);
587 // Compute VID_MEM/SYS_MEM address of page directory entry
588 pde_phys = ((uint64_t)pd_config.page_dir << 12) + pde_idx * sizeof(page_dir_entry_v1_t);
589 // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA)
590 pde_kva = pd_deref(g, pde_phys, INST2PD_TARGET(pd_config.target));
591 if (IS_ERR_OR_NULL(pde_kva)) {
592 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_phys, target_to_text(pd_config.target), PTR_ERR(pde_kva));
593 return PTR_ERR(pde_kva);
594 }
595 // Read page directory entry (readq seems to work fine; tested on GM204)
596 pde.raw = readq(pde_kva);
597 // Verify this PDE points to an array of page table entries
598 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
599 return -ENXIO;
600 // TODO: Check for and handle huge pages
601 printk_debug(KERN_DEBUG "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
602
603 // Shift bits which define PTE index to start at bit 0, and mask other bits
604 pte_idx = (addr_to_find >> NV_MMU_PT_V1_LSB[1]) & (NV_MMU_PT_V1_SZ[1] - 1);
605 // Compute VID_MEM/SYS_MEM address of page table entry
606 pte_phys = ((uint64_t)pde.alt_addr << 12) + pte_idx * sizeof(page_tbl_entry_v1_t);
607 // Convert VID_MEM/SYS_MEM address to Kernel-accessible Virtual Address (KVA)
608 pte_kva = pd_deref(g, pte_phys, V12PD_TARGET(pde.alt_target));
 609	if (IS_ERR_OR_NULL(pte_kva)) {
610 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_phys, pd_target_to_text(pde.alt_target), PTR_ERR(pte_kva));
611 return PTR_ERR(pte_kva);
612 }
613 // Read page table entry
614 pte.raw = readq(pte_kva);
615 // XXX: The above readq() is bogus on gk104 (returns -1). Potential issue of pd_deref's move of PRAMIN racing with the driver?
616 if (!pte.is_present)
617 return -ENXIO;
618 printk_debug(KERN_DEBUG "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)pte.addr) << 12, target_to_text(pte.target), pte.is_volatile, pte.is_privileged, pte.is_readonly, pte.atomics_disabled, pte.raw);
619 // Access PTE and return physical address
620 *found_addr = (uint64_t)pte.addr << 12;
621 *found_aperture = pte.target;
622 return 0;
623}
624
 218/* *** UNTESTED *** 625/* *** UNTESTED ***
626// This is only relevant on pre-Kepler GPUs; not a current priority
219#define NV_MMU_PT_V0_SZ 2048 627#define NV_MMU_PT_V0_SZ 2048
220#define NV_MMU_PT_V0_LSB 29 628#define NV_MMU_PT_V0_LSB 29
221uint64_t search_v0_page_directory(struct nvdebug_state *g, 629uint64_t search_v0_page_directory(struct nvdebug_state *g,
diff --git a/nvdebug.h b/nvdebug.h
index ca0f514..3ac8db4 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -2,6 +2,7 @@
2 * SPDX-License-Identifier: MIT 2 * SPDX-License-Identifier: MIT
3 * 3 *
4 * File outline: 4 * File outline:
5 * - Configuration options
5 * - Runlist, preemption, and channel control (FIFO) 6 * - Runlist, preemption, and channel control (FIFO)
6 * - Basic GPU information (MC) 7 * - Basic GPU information (MC)
7 * - Detailed GPU information (PTOP, FUSE, and CE) 8 * - Detailed GPU information (PTOP, FUSE, and CE)
@@ -20,6 +21,27 @@
20// this, so declare as incomplete type to avoid pulling in the nvgpu headers. 21// this, so declare as incomplete type to avoid pulling in the nvgpu headers.
21struct gk20a; 22struct gk20a;
22 23
24// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
25// in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN
26// **must** not be moved during runlist traversal.
27// - The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this
28// must be enabled to print the runlist on the TX2.
29// - On the A100 in Google Cloud and H100 in Paperspace, as of Aug 2024, this is
30// needed, as nvdebug is not finding (at least) runlist0 mapped in BAR2/3.
31// Automatically disables printing Instance Block and Context State while
32// traversing the runlist, as these require conflicting uses of PRAMIN (it's
33// needed to search the page tables for the Instance Block in BAR2/3, and to
34// access anything in the Context State---aka CTXSW).
35#define FALLBACK_TO_PRAMIN
36
37// Starting offset for registers in the corresponding named range
38// Programmable First-In First-Out unit; also known as "Host"
39#define NV_PFIFO 0x00002000 // 8 KiB long; ends prior to 0x00004000
40// Programmable Channel Control System RAM
41#define NV_PCCSR 0x00800000 // 16 KiB long; ends prior to 0x00810000
42// Programmable TOPology registers
43#define NV_PTOP 0x00022400 // 1 KiB long; ends prior to 0x00022800
44
23/* Runlist Channel 45/* Runlist Channel
24 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue 46 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
25 of GPU commands. These commands are typically queued from userspace. 47 of GPU commands. These commands are typically queued from userspace.
@@ -202,7 +224,7 @@ typedef union {
202 Support: Ampere, Hopper, Ada, [newer untested] 224 Support: Ampere, Hopper, Ada, [newer untested]
203*/ 225*/
204#define NV_RUNLIST_PREEMPT_GA100 0x098 226#define NV_RUNLIST_PREEMPT_GA100 0x098
205#define PREEMPT_TYPE_RUNLIST 0 227#define PREEMPT_TYPE_RUNLIST PREEMPT_TYPE_CHANNEL
206 228
207/* 229/*
208 "Initiate a preempt of the engine by writing the bit associated with its 230 "Initiate a preempt of the engine by writing the bit associated with its
@@ -355,6 +377,14 @@ typedef union {
355 uint64_t raw; 377 uint64_t raw;
356} runlist_base_tu102_t; 378} runlist_base_tu102_t;
357 379
380/*
381 LEN : Read/Write
382 OFFSET : Read/Write
383 PREEMPTED_TSGID : Read-only
384 VALID_PREEMPTED_TSGID : Read-only
385 IS_PENDING : Read-only
386 PREEMPTED_OFFSET : Read-only
387*/
358typedef union { 388typedef union {
359 struct { 389 struct {
360 uint16_t len:16; 390 uint16_t len:16;
@@ -416,6 +446,27 @@ typedef union {
416 uint32_t raw; 446 uint32_t raw;
417} runlist_channel_config_t; 447} runlist_channel_config_t;
418 448
449/* Context Switch Timeout Configuration
450 After a task's budget expires, there's a configurable grace period, a
451 "timeout", within which the context needs to complete. After this timeout
452 expires, an interrupt is raised to terminate the task.
453
454 This register configures if such a timeout is enabled and how long the
455 timeout is (the "period").
456
457 Support: Volta, Turing
458*/
459#define NV_PFIFO_ENG_CTXSW_TIMEOUT 0x00002A0C
460// Support: Ampere
461#define NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(i) (0x220+(i)*64)
462typedef union {
463 struct {
464 uint32_t period:31;
465 bool enabled:1;
466 } __attribute__((packed));
467 uint32_t raw;
468} ctxsw_timeout_t;
469
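A call-site sketch (not part of this commit) decoding the pre-Ampere register with the union above:

// Hypothetical call site: decode the PFIFO-global context switch timeout
// configuration on Volta/Turing (on Ampere, use the per-runlist register).
static void example_print_ctxsw_timeout(struct nvdebug_state *g) {
	ctxsw_timeout_t timeout;
	if ((timeout.raw = nvdebug_readl(g, NV_PFIFO_ENG_CTXSW_TIMEOUT)) == -1)
		return; // Bad register read
	printk(KERN_INFO "[nvdebug] Context switch timeout %s, period %u\n",
	       timeout.enabled ? "enabled" : "disabled", timeout.period);
}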
419/* Programmable Channel Control System RAM (PCCSR) 470/* Programmable Channel Control System RAM (PCCSR)
420 512-entry array of channel control and status data structures. 471 512-entry array of channel control and status data structures.
421 472
@@ -477,8 +528,15 @@ typedef union {
477 bool busy:1; 528 bool busy:1;
478 uint32_t :3; 529 uint32_t :3;
479 } __attribute__((packed)); 530 } __attribute__((packed));
531 struct {
532 uint32_t word1;
533 uint32_t word2;
534 } __attribute__((packed));
480 uint64_t raw; 535 uint64_t raw;
481} channel_ctrl_t; 536} channel_ctrl_gf100_t;
537
538// TODO: Remove use of deprecated type name
539typedef channel_ctrl_gf100_t channel_ctrl_t;
482 540
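For illustration only (not part of this commit), a caller might fetch a channel's control word using the 32-bit word1/word2 view added above, assuming the 512 64-bit PCCSR entries are packed back-to-back starting at NV_PCCSR:

// Hypothetical helper: read the PCCSR entry for channel `chid` as two 32-bit
// words (assumes an 8-byte stride; the stride is not restated in this excerpt).
static inline channel_ctrl_gf100_t example_read_channel_ctrl(struct nvdebug_state *g,
                                                             unsigned int chid) {
	channel_ctrl_gf100_t chan;
	chan.word1 = nvdebug_readl(g, NV_PCCSR + chid * 8);
	chan.word2 = nvdebug_readl(g, NV_PCCSR + chid * 8 + 4);
	return chan;
}
// Usage sketch: if (example_read_channel_ctrl(g, 0).busy) printk(KERN_INFO "[nvdebug] Channel 0 busy\n");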
483/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) 541/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+)
484 Starting with Ampere, channel IDs are no longer unique indexes into the 542 Starting with Ampere, channel IDs are no longer unique indexes into the
@@ -543,6 +601,8 @@ typedef union {
543 Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing 601 Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing
544*/ 602*/
545#define NV_PFIFO_SCHED_DISABLE 0x00002630 603#define NV_PFIFO_SCHED_DISABLE 0x00002630
604// Support: Ampere
605#define NV_RUNLIST_SCHED_DISABLE 0x094
546typedef union { 606typedef union {
547 struct { 607 struct {
548 bool runlist_0:1; 608 bool runlist_0:1;
@@ -1018,7 +1078,7 @@ typedef union {
1018 struct { 1078 struct {
1019 uint32_t ptr:28; 1079 uint32_t ptr:28;
1020 enum INST_TARGET target:2; 1080 enum INST_TARGET target:2;
1021 uint32_t :1; 1081 uint32_t :1; // disable_cya_debug for BAR2
1022 bool is_virtual:1; 1082 bool is_virtual:1;
1023 } __attribute__((packed)); 1083 } __attribute__((packed));
1024 uint32_t raw; 1084 uint32_t raw;
@@ -1091,6 +1151,9 @@ typedef union {
1091 Support: Tesla 2.0* through Ampere, Ada 1151 Support: Tesla 2.0* through Ampere, Ada
1092 *FAULT_REPLAY_* fields are Pascal+ only 1152 *FAULT_REPLAY_* fields are Pascal+ only
1093 See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc) 1153 See also: dev_ram.h (open-gpu-kernel-modules) or dev_ram.ref.txt (open-gpu-doc)
1154
1155 It appears that on Hopper, IS_VER2 continues to mean IS_VER2, but if unset, the
1156 alternative is VER3.
1094*/ 1157*/
1095#define NV_PRAMIN_PDB_CONFIG_OFF 0x200 1158#define NV_PRAMIN_PDB_CONFIG_OFF 0x200
1096typedef union { 1159typedef union {
@@ -1101,7 +1164,7 @@ typedef union {
1101 bool fault_replay_tex:1; 1164 bool fault_replay_tex:1;
1102 bool fault_replay_gcc:1; 1165 bool fault_replay_gcc:1;
1103 uint32_t :4; 1166 uint32_t :4;
1104 bool is_ver2:1; 1167 bool is_ver2:1; // XXX: Not on Hopper. May be set or not for same page_dir.
1105 bool is_64k_big_page:1; // 128Kb otherwise 1168 bool is_64k_big_page:1; // 128Kb otherwise
1106 uint32_t page_dir_lo:20; 1169 uint32_t page_dir_lo:20;
1107 uint32_t page_dir_hi:32; 1170 uint32_t page_dir_hi:32;
@@ -1421,6 +1484,182 @@ typedef union {
1421} page_tbl_entry_v0_t; 1484} page_tbl_entry_v0_t;
1422*/ 1485*/
1423 1486
1487/* Fifo Context RAM (RAMFC) and channel INstance RAM (RAMIN)
1488
1489 Each channel is configured with a 4 KiB instance block. The prefix of this
1490 block is referred to as RAMFC and stores channel-specific state for the Host
1491 (aka PFIFO).
1492
1493 "A GPU instance block is a block of memory that contains the state
1494 for a GPU context. A GPU context's instance block consists of Host state,
1495 pointers to each engine's state, and memory management state. A GPU instance
1496 block also contains a pointer to a block of memory that contains that part of a
1497 GPU context's state that a user-level driver may access. A GPU instance block
1498 fits within a single 4K-byte page of memory."
1499
1500 "The NV_RAMFC part of a GPU-instance block contains Host's part of a virtual
1501 GPU's state. Host is referred to as "FIFO". "FC" stands for FIFO Context.
1502 When Host switches from serving one GPU context to serving a second, Host saves
1503 state for the first GPU context to the first GPU context's RAMFC area, and loads
1504 state for the second GPU context from the second GPU context's RAMFC area."
1505
1506 "Every Host word entry in RAMFC directly corresponds to a PRI-accessible
1507 register. For a description of the contents of a RAMFC entry, please see the
1508 description of the corresponding register in "manuals/dev_pbdma.ref". The
1509 offsets of the fields within each entry in RAMFC match those of the
1510 corresponding register in the associated PBDMA unit's PRI space."
1511
1512 In summary, RAMFC includes details such as the head and tail of the pushbuffer,
1513 and RAMIN includes details such as the page table configuration(s).
1514
1515 The instance-global page table (as defined in the PDB field) is only used for
1516 GPU engines which do not support subcontexts (non-VEID engines).
1517
1518 **Not all documented fields are currently populated below.**
1519
1520 Support: *Kepler, *Maxwell, *Pascal, Volta, Turing, Ampere, [newer untested]
1521 *Pre-Volta GPUs do not support subcontexts.
1522 See also: dev_ram.ref.txt and dev_pbdma.ref.txt in NVIDIA's open-gpu-doc
1523*/
1524
1525// 16-byte (128-bit) substructure defining a subcontext configuration
1526typedef struct {
1527 page_dir_config_t pdb;
1528 uint32_t pasid:20; // Process Address Space ID (PASID) used for ATS
1529 uint32_t :11;
1530 bool enable_ats:1; // Enable Address Translation Services (ATS)?
1531 uint32_t pad;
1532} __attribute__((packed)) subcontext_ctrl_t;
1533
1534typedef struct {
1535// Start RAMFC (512 bytes)
1536 uint32_t pad[43];
1537 uint32_t fc_target:5; // NV_RAMFC_TARGET; off 43
1538 uint32_t :27;
1539 uint32_t pad2[17];
1540 uint32_t fc_config_l2:1; // NV_RAMFC_CONFIG; off 61
1541 uint32_t :3;
1542 uint32_t fc_config_ce_split:1;
1543 uint32_t fc_config_ce_no_throttle:1;
1544 uint32_t :2;
1545 uint32_t fc_config_is_priv:1; // ...AUTH_LEVEL
1546 uint32_t :3;
1547 uint32_t fc_config_userd_writeback:1; // ...USERD_WRITEBACK
1548 uint32_t :19;
1549 uint32_t pad3[1];
1550 uint32_t fc_chan_info_scg:1; // ...SET_CHANNEL_INFO_SCG_TYPE
1551 uint32_t :7;
1552 uint32_t fc_chan_info_veid:6; // ...SET_CHANNEL_INFO_VEID
1553 uint32_t fc_chan_info_chid:12; // ...SET_CHANNEL_INFO_CHID
1554 uint32_t :6;
1555 uint32_t pad4[64];
1556// End RAMFC
1557// Start RAMIN
1558 page_dir_config_t pdb;
1559 uint32_t pad5[2];
1560 // WFI_TARGET appears to be ignored if WFI_IS_VIRTUAL
1561 uint32_t engine_wfi_target:2; // NV_RAMIN_ENGINE_WFI_TARGET; off 132
1562 uint32_t engine_wfi_is_virtual:1;
1563 uint32_t :9;
1564 // WFI_PTR points to a CTXSW block (documented below)
1565 uint64_t engine_wfi_ptr:52; // NV_RAMIN_ENGINE_WFI_PTR_LO/_HI; off 132--133
1566 uint32_t engine_wfi_veid:6; // NV_RAMIN_ENGINE_WFI_VEID; off 134; VEID == Subcontext ID
1567 uint32_t :26;
1568 uint32_t pasid:20; // NV_RAMIN_PASID; off 135; "Process Address Space ID"
1569 uint32_t :11;
1570 bool enable_ats:1;
1571 uint32_t pad6[30];
1572 uint64_t subcontext_pdb_valid; // NV_RAMIN_SC_PDB_VALID; off 166-167
1573 subcontext_ctrl_t subcontext[64]; // NV_RAMIN_SC_*; off 168-424
1574} __attribute__((packed)) instance_ctrl_t;
1575
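A call-site sketch (not part of this commit) walking the subcontext table of an instance block obtained via instance_deref() (declared later in this header), assuming one valid bit per VEID in subcontext_pdb_valid:

// Hypothetical call site: print the page directory base of every subcontext
// (VEID) that the instance block marks as valid.
static void example_print_valid_veids(struct nvdebug_state *g,
                                      uint64_t instance_addr,
                                      enum INST_TARGET instance_target) {
	int veid;
	instance_ctrl_t *inst = instance_deref(g, instance_addr, instance_target);
	if (IS_ERR_OR_NULL(inst))
		return;
	for (veid = 0; veid < 64; veid++)
		if (inst->subcontext_pdb_valid & (1ULL << veid))
			printk(KERN_INFO "[nvdebug] VEID %d: PDB at %#018llx (%s)\n", veid,
			       (u64)inst->subcontext[veid].pdb.page_dir << 12,
			       inst->subcontext[veid].pdb.is_ver2 ? "V2" : "V1");
}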
1576// Context types
1577enum CTXSW_TYPE {
1578 CTXSW_UNDEFINED = 0x0,
1579 CTXSW_OPENGL = 0x8,
1580 CTXSW_DX9 = 0x10,
1581 CTXSW_DX10 = 0x11,
1582 CTXSW_DX11 = 0x12,
1583 CTXSW_COMPUTE = 0x20,
1584 CTXSW_HEADER = 0x21 // A per-subcontext header
1585};
1586static inline const char *ctxsw_type_to_text(enum CTXSW_TYPE t) {
1587 switch (t) {
1588 case CTXSW_UNDEFINED:
1589 return "[None]";
1590 case CTXSW_OPENGL:
1591 return "OpenGL";
1592 case CTXSW_DX9:
1593 case CTXSW_DX10:
1594 case CTXSW_DX11:
1595 return "DirectX";
1596 case CTXSW_COMPUTE:
1597 return "Compute";
1598 case CTXSW_HEADER:
1599 return "Header";
1600 default:
1601 return "UNKNOWN";
1602 }
1603}
1604
1605// Preemption modes:
1606// WFI: Wait For Idle (preempt on idle)
1607// CTA: Cooperative Thread Array-level Preemption (preempt at end of block)
1608// CILP: Compute-Instruction-Level Preemption (preempt at end of instruction)
1609enum GRAPHICS_PREEMPT_TYPE {PREEMPT_WFI, PREEMPT_GFXP};
1610enum COMPUTE_PREEMPT_TYPE {_PREEMPT_WFI, PREEMPT_CTA, PREEMPT_CILP};
1611static inline const char *compute_preempt_type_to_text(enum COMPUTE_PREEMPT_TYPE t) {
1612 switch (t) {
1613 case PREEMPT_WFI:
1614 return "WFI";
1615 case PREEMPT_CTA:
1616 return "CTA";
1617 case PREEMPT_CILP:
1618 return "CILP";
1619 default:
1620 return "INVALID";
1621 }
1622}
 1623static inline const char *graphics_preempt_type_to_text(enum GRAPHICS_PREEMPT_TYPE t) {
1624 switch (t) {
1625 case PREEMPT_WFI:
1626 return "WFI";
1627 case PREEMPT_GFXP:
1628 return "GFXP";
1629 default:
1630 return "INVALID";
1631 }
1632}
1633
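A call-site sketch (not part of this commit) of how these mode values and helpers might be used with runlist.c's set_channel_preemption_mode(), which is declared further down in this header:

// Hypothetical call site: request CTA-level preemption for channel `chan_id`
// on runlist `rl_id` and log the outcome.
static int example_request_cta_preemption(struct nvdebug_state *g,
                                          uint32_t chan_id, uint32_t rl_id) {
	int err = set_channel_preemption_mode(g, chan_id, rl_id, PREEMPT_CTA);
	printk(KERN_INFO "[nvdebug] %s preemption request for channel %u: %d\n",
	       compute_preempt_type_to_text(PREEMPT_CTA), chan_id, err);
	return err;
}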
1634/* ConTeXt SWitch control block (CTXSW)
1635 Support: Maxwell*, Pascal**, Volta, Turing, Ampere, Ada
1636 *Nothing except for CONTEXT_ID and TYPE
1637 **Except as noted
1638 See also: manuals/volta/gv100/dev_ctxsw.ref.txt in open-gpu-doc
1639 and hw_ctxsw_prog_*.h in nvgpu
1640*/
1641// (Note that this layout changes some generation-to-generation)
1642typedef struct context_switch_block {
1643 uint32_t pad[3];
1644 enum CTXSW_TYPE type:6; // Unused except when type CTXSW_HEADER?
1645 uint32_t :26;
1646 uint32_t pad2[26];
1647 // The context buffer ptr fields are in an opposite-of-typical order, so we
1648 // can't merge them into a single context_buffer_ptr field.
1649 uint32_t context_buffer_ptr_hi; // Volta+ only
1650 uint32_t context_buffer_ptr_lo; // Volta+ only
1651 enum GRAPHICS_PREEMPT_TYPE graphics_preemption_options:32;
1652 enum COMPUTE_PREEMPT_TYPE compute_preemption_options:32;
1653 uint32_t pad3[18];
1654 uint32_t num_wfi_save_operations;
1655 uint32_t num_cta_save_operations;
1656 uint32_t num_gfxp_save_operations;
1657 uint32_t num_cilp_save_operations;
1658 uint32_t pad4[4];
1659 uint32_t context_id;
1660 // [There are more fields not yet added here.]
1661} __attribute__((packed)) context_switch_ctrl_t;
1662
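A call-site sketch (not part of this commit) using get_ctxsw() (declared later in this header) to inspect the block above; the helper name is hypothetical.

// Hypothetical call site: given an instance block, print the configured
// compute preemption mode and the CILP save counter from its CTXSW block.
static void example_print_ctxsw_info(struct nvdebug_state *g, instance_ctrl_t *inst) {
	context_switch_ctrl_t *ctxsw = get_ctxsw(g, inst);
	if (IS_ERR_OR_NULL(ctxsw))
		return;
	printk(KERN_INFO "[nvdebug] Context %u: compute preemption %s, %u CILP saves\n",
	       ctxsw->context_id,
	       compute_preempt_type_to_text(ctxsw->compute_preemption_options),
	       ctxsw->num_cilp_save_operations);
}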
1424/* VRAM Information 1663/* VRAM Information
1425 1664
1426 If ECC is disabled: 1665 If ECC is disabled:
@@ -1452,6 +1691,12 @@ static inline uint64_t memory_range_to_bytes(memory_range_t range) {
1452 1691
1453/* Begin nvdebug types and functions */ 1692/* Begin nvdebug types and functions */
1454 1693
1694// __iomem is only defined when building as a kernel module, so conditionally
1695// define it to allow including this header outside the kernel.
1696#ifndef __iomem
1697#define __iomem
1698#endif
1699
1455// Vendor ID for PCI devices manufactured by NVIDIA 1700// Vendor ID for PCI devices manufactured by NVIDIA
1456#define NV_PCI_VENDOR 0x10de 1701#define NV_PCI_VENDOR 0x10de
1457struct nvdebug_state { 1702struct nvdebug_state {
@@ -1474,6 +1719,10 @@ struct nvdebug_state {
1474 struct platform_device *platd; 1719 struct platform_device *platd;
1475 // Pointer to generic device struct (both platform and pcie devices) 1720 // Pointer to generic device struct (both platform and pcie devices)
1476 struct device *dev; 1721 struct device *dev;
1722#ifdef __KERNEL__
1723 // List used by mmu.c to track allocated pages for page directories/tables
1724 struct list_head pd_allocs;
1725#endif
1477}; 1726};
1478 1727
1479// This disgusting macro is a crutch to work around the fact that runlists were 1728// This disgusting macro is a crutch to work around the fact that runlists were
@@ -1542,7 +1791,19 @@ int get_runlist_iter(
1542 struct runlist_iter *rl_iter /* out */); 1791 struct runlist_iter *rl_iter /* out */);
1543int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id); 1792int preempt_tsg(struct nvdebug_state *g, uint32_t rl_id, uint32_t tsg_id);
1544int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id); 1793int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id);
1545int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id); 1794int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off);
1795instance_ctrl_t *instance_deref(
1796 struct nvdebug_state *g,
1797 uint64_t instance_addr,
1798 enum INST_TARGET instance_target);
1799context_switch_ctrl_t *get_ctxsw(
1800 struct nvdebug_state *g,
1801 instance_ctrl_t *inst);
1802int set_channel_preemption_mode(
1803 struct nvdebug_state *g,
1804 uint32_t chan_id,
1805 uint32_t rl_id,
1806 enum COMPUTE_PREEMPT_TYPE mode);
1546 1807
1547// Defined in mmu.c 1808// Defined in mmu.c
1548uint64_t search_page_directory( 1809uint64_t search_page_directory(
@@ -1550,11 +1811,33 @@ uint64_t search_page_directory(
1550 page_dir_config_t pd_config, 1811 page_dir_config_t pd_config,
1551 uint64_t addr_to_find, 1812 uint64_t addr_to_find,
1552 enum INST_TARGET addr_to_find_aperture); 1813 enum INST_TARGET addr_to_find_aperture);
1814int translate_page_directory(
1815 struct nvdebug_state *g,
1816 page_dir_config_t pd_config,
1817 uint64_t addr_to_find,
1818 uint64_t *found_addr /* out */,
1819 enum INST_TARGET *found_aperture /* out */);
1820int map_page_directory(
1821 struct nvdebug_state *g,
1822 page_dir_config_t pd_config,
 1823 uint64_t vaddr_to_find,
 1824 uint64_t paddr_to_map,
1825 enum INST_TARGET paddr_target,
1826 bool huge_page);
1827int gc_page_directory(
1828 struct nvdebug_state *g,
1829 bool force);
1553uint64_t search_v1_page_directory( 1830uint64_t search_v1_page_directory(
1554 struct nvdebug_state *g, 1831 struct nvdebug_state *g,
1555 page_dir_config_t pd_config, 1832 page_dir_config_t pd_config,
1556 uint64_t addr_to_find, 1833 uint64_t addr_to_find,
1557 enum INST_TARGET addr_to_find_aperture); 1834 enum INST_TARGET addr_to_find_aperture);
1835int translate_v1_page_directory(
1836 struct nvdebug_state *g,
1837 page_dir_config_t pd_config,
1838 uint64_t addr_to_find,
1839 uint64_t *found_addr /* out */,
1840 enum INST_TARGET *found_aperture /* out */);
1558// Defined in bus.c 1841// Defined in bus.c
1559int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); 1842int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target);
1560int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */); 1843int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */);
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 3a10e13..c0cfa63 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -15,7 +15,7 @@
15 15
16// Enable to intercept and log GPU interrupts. Historically used to benchmark 16// Enable to intercept and log GPU interrupts. Historically used to benchmark
17// interrupt latency. 17// interrupt latency.
18#define INTERRUPT_DEBUG 0 18#define INTERRUPT_DEBUG
19 19
20// MIT is GPL-compatible. We need to be GPL-compatible for symbols like 20// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
21// platform_bus_type or bus_find_device_by_name... 21// platform_bus_type or bus_find_device_by_name...
@@ -28,12 +28,20 @@ extern struct file_operations runlist_file_ops;
28extern struct file_operations preempt_tsg_file_ops; 28extern struct file_operations preempt_tsg_file_ops;
29extern struct file_operations disable_channel_file_ops; 29extern struct file_operations disable_channel_file_ops;
30extern struct file_operations enable_channel_file_ops; 30extern struct file_operations enable_channel_file_ops;
31extern struct file_operations wfi_preempt_channel_file_ops;
32extern struct file_operations cta_preempt_channel_file_ops;
33extern struct file_operations cil_preempt_channel_file_ops;
31extern struct file_operations resubmit_runlist_file_ops; 34extern struct file_operations resubmit_runlist_file_ops;
35extern struct file_operations preempt_runlist_file_ops;
36extern struct file_operations ack_bad_tsg_file_ops;
37extern struct file_operations map_mem_chid_file_ops;
38extern struct file_operations map_mem_ctxid_file_ops;
32extern struct file_operations switch_to_tsg_file_ops; 39extern struct file_operations switch_to_tsg_file_ops;
33// device_info_procfs.c 40// device_info_procfs.c
34extern struct file_operations device_info_file_ops; 41extern struct file_operations device_info_file_ops;
35extern struct file_operations nvdebug_read_reg32_file_ops; 42extern struct file_operations nvdebug_read_reg32_file_ops;
36extern struct file_operations nvdebug_read_reg_range_file_ops; 43extern struct file_operations nvdebug_read_reg_range_file_ops;
44extern struct file_operations nvdebug_read_part_file_ops;
37extern struct file_operations local_memory_file_ops; 45extern struct file_operations local_memory_file_ops;
38// copy_topology_procfs.c 46// copy_topology_procfs.c
39extern struct file_operations copy_topology_file_ops; 47extern struct file_operations copy_topology_file_ops;
@@ -71,9 +79,271 @@ const struct file_operations* compat_ops(const struct file_operations* ops) {
71} 79}
72#endif 80#endif
73 81
74#if INTERRUPT_DEBUG 82#ifdef INTERRUPT_DEBUG
83
84void nvdebug_fifo_intr(struct nvdebug_state *g) {
85 uint32_t fifo_intr_mask;// = nvdebug_readl(g, 0x02100); // PFIFO_INTR_0
86 fifo_intr_mask = nvdebug_readl(g, 0x02100); // PFIFO_INTR_0
87 if (fifo_intr_mask & 1 << 0)
88 printk(KERN_INFO "[nvdebug] - Interrupt BIND_ERROR.\n");
89 if (fifo_intr_mask & 1 << 1)
90 printk(KERN_INFO "[nvdebug] - Interrupt CTXSW_TIMEOUT.\n");
91 if (fifo_intr_mask & 1 << 4)
92 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_IDLE.\n");
93 if (fifo_intr_mask & 1 << 5)
94 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_AND_ENG_IDLE.\n");
95 if (fifo_intr_mask & 1 << 6)
96 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE.\n");
97 if (fifo_intr_mask & 1 << 7)
98 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_ACQUIRE_AND_ENG_IDLE.\n");
99 if (fifo_intr_mask & 1 << 8)
100 printk(KERN_INFO "[nvdebug] - Interrupt SCHED_ERROR.\n");
101 if (fifo_intr_mask & 1 << 16)
102 printk(KERN_INFO "[nvdebug] - Interrupt CHSW_ERROR.\n");
103 if (fifo_intr_mask & 1 << 23)
104 printk(KERN_INFO "[nvdebug] - Interrupt MEMOP_TIMEOUT.\n");
105 if (fifo_intr_mask & 1 << 24)
106 printk(KERN_INFO "[nvdebug] - Interrupt LB_ERROR.\n");
107 if (fifo_intr_mask & 1 << 25) // OLD; Pascal
108 printk(KERN_INFO "[nvdebug] - Interrupt REPLAYABLE_FAULT_ERROR.\n");
109 if (fifo_intr_mask & 1 << 27) // OLD; Pascal
110 printk(KERN_INFO "[nvdebug] - Interrupt DROPPED_MMU_FAULT.\n");
111 if (fifo_intr_mask & 1 << 28) { // On Pascal, this is MMU_FAULT
112 if (g->chip_id <= NV_CHIP_ID_VOLTA) // MMU_FAULT on Pascal (nvgpu, l4t/l4t-r28.1:drivers/gpu/nvgpu/include/nvgpu/hw/gp10b/hw_fifo_gp10b.h)
113 printk(KERN_INFO "[nvdebug] - Interrupt MMU_FAULT.\n");
114 else // Repurposed starting with Turing: open-gpu-doc/manuals/turing/tu104/dev_fifo.ref.txt
115 printk(KERN_INFO "[nvdebug] - Interrupt TSG_PREEMPT_COMPLETE.\n");
116 }
117 if (fifo_intr_mask & 1 << 29)
118 printk(KERN_INFO "[nvdebug] - Interrupt PBDMA_INTR.\n");
119 if (fifo_intr_mask & 1 << 30) {
120 printk(KERN_INFO "[nvdebug] - Interrupt RUNLIST_EVENT.\n");
121 uint32_t fifo_runlist_intr_mask = nvdebug_readl(g, 0x02A00); // PFIFO_INTR_RUNLIST
122 printk(KERN_INFO "[nvdebug] - Event %#x.\n", fifo_runlist_intr_mask);
123 }
124 if (fifo_intr_mask & 1 << 31)
125 printk(KERN_INFO "[nvdebug] - Interrupt CHANNEL_INTR.\n");
126}
127
75irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) { 128irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
76 printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num); 129 struct nvdebug_state *g = dev;
130	u64 time = ktime_get_raw_ns(); // CLOCK_MONOTONIC_RAW
131 // NV_PMC_INTR does not exist on Ada, so use NV_FUNC_PRIV_CPU_INTR_TOP
132 // Note that this also appears to exist on Turing
133	if (g->chip_id >= NV_CHIP_ID_TURING) { // was NV_CHIP_ID_AMPERE
134 int i;
135	// Despite being an indexed register, it is only documented to have one entry, and could only support two
136 uint32_t intr_mask0 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1600); // NV_FUNC_PRIV_CPU_INTR_TOP(0)
137 uint32_t intr_mask1 = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1604); // NV_FUNC_PRIV_CPU_INTR_TOP(1)
138 printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with CPU_INTR_TOP(0) %#010x, ...(1) %#010x @ %llu.\n", irq_num, intr_mask0, intr_mask1, time);
139 for (i = 0; i < 8; i++) {
140 uint32_t leaf = nvdebug_readl(g, NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET + 0x1000 + i*4); // NV_FUNC_PRIV_CPU_INTR_LEAF(0) to ...(7)
141 if (leaf)
142 printk(KERN_INFO "[nvdebug] - Interrupt leaf %d: %#010x\n", i, leaf);
143 // 131-133 & 64 are faults on tu104??? (open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt)
144 if (136 / 32 == i && 1 << (136 % 32) & leaf) // PFIFO0 ga100
145 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO0.\n");
146 if (137 / 32 == i && 1 << (137 % 32) & leaf) // PFIFO1 ga100
147 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO1.\n");
148 if (148 / 32 == i && 1 << (148 % 32) & leaf) // TIMER ga100
149 printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n");
150 if (152 / 32 == i && 1 << (152 % 32) & leaf) // PMU ga100
151 printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n");
152 if (156 / 32 == i && 1 << (156 % 32) & leaf) { // PBUS ga100
153 printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n");
154 uint32_t bus_intr = nvdebug_readl(g, 0x1100); // BUS_INTR_0
155 if (bus_intr & 1 << 2) {
156 // use timer_pri_timeout_save_0_r
157 uint32_t SAVE_0 = nvdebug_readl(g, 0x00009084); // NV_PTIMER_PRI_TIMEOUT_SAVE_0
158 printk(KERN_INFO "[nvdebug] - Interrupt PRI_FECSERR on %s to address %#010x %stargeting FECS.\n", SAVE_0 & 0x2 ? "write" : "read", SAVE_0 & 0x00fffffc, SAVE_0 & 0x80000000 ? "" : "not ");
159 uint32_t SAVE_1 = nvdebug_readl(g, 0x00009088); // NV_PTIMER_PRI_TIMEOUT_SAVE_1
160 if (SAVE_1)
161 printk(KERN_INFO "[nvdebug] Data written: %#010x\n", SAVE_1);
162 uint32_t errcode = readl(g->regs + 0x0000908C); // NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE
163 if (errcode)
164 printk(KERN_INFO "[nvdebug] FECS Error Code: %#010x\n", errcode);
165 // badf5040 is a "client error" (0) of "no such address" (40)
166 // See linux-nvgpu/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c
167 // for how to decode.
168 }
169 if (bus_intr & 1 << 3)
170 printk(KERN_INFO "[nvdebug] - Interrupt PRI_TIMEOUT.\n");
171 if (bus_intr & 1 << 4)
172 printk(KERN_INFO "[nvdebug] - Interrupt FB_REQ_TIMEOUT.\n");
173 if (bus_intr & 1 << 5)
174 printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_TIMEOUT.\n");
175 if (bus_intr & 1 << 6)
176 printk(KERN_INFO "[nvdebug] - Interrupt FB_ACK_EXTRA.\n");
177 if (bus_intr & 1 << 7)
178 printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_TIMEOUT.\n");
179 if (bus_intr & 1 << 8)
180 printk(KERN_INFO "[nvdebug] - Interrupt FB_RDATA_EXTRA.\n");
181 if (bus_intr & 1 << 26)
182 printk(KERN_INFO "[nvdebug] - Interrupt SW.\n");
183 if (bus_intr & 1 << 27)
184 printk(KERN_INFO "[nvdebug] - Interrupt POSTED_DEADLOCK_TIMEOUT.\n");
185 if (bus_intr & 1 << 28)
186 printk(KERN_INFO "[nvdebug] - Interrupt MPMU.\n");
187 if (bus_intr & 1 << 31)
188 printk(KERN_INFO "[nvdebug] - Interrupt ACCESS_TIMEOUT.\n");
189 }
190 if (158 / 32 == i && 1 << (158 % 32) & leaf) // PRIV_RING ga100
191 printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n");
192 if (192 / 32 == i && 1 << (192 % 32) & leaf) // LEGACY_ENGINE_STALL ga100
193 printk(KERN_INFO "[nvdebug] - Interrupt on LEGACY_ENGINE_STALL.\n");
194 if (160 / 32 == i && 1 << (160 % 32) & leaf) { // (likely) rl0 ga100
195 printk(KERN_INFO "[nvdebug] - Interrupt on RUNLIST0.\n");
196 uint32_t off;
197 get_runlist_ram(g, 0, &off);
198 uint32_t rl_intr = nvdebug_readl(g, off+0x100);
199 printk(KERN_INFO "[nvdebug] - RUNLIST_INTR_0: %#x\n", rl_intr);
200	if (rl_intr & 1 << 12) { // BAD_TSG
201 printk(KERN_INFO "[nvdebug] - BAD_TSG: %#x\n", nvdebug_readl(g, off+0x174));
202 }
203 }
204 // Also getting 160, 161, and 162
205
206 //uint32_t off;
207 //get_runlist_ram(g, 12, &off);
208 //printk(KERN_INFO "[nvdebug] - rl10 vector id 0 is %x\n", nvdebug_readl(g, off+0x160)); // NV_RUNLIST_INTR_VECTORID(0)
209 // 160 is rl0 (C/G, LCE0, LCE1) vector id 0
210 // 168 is rl11 (LCE3) vector id 0
211 // 169 is rl12 (LCE4) vector id 0
212 // 171 is rl1 (SEC) vector id 0
213 // 176 is rl10 (LCE2) vector id 0
214 // 224 is rl0 vector id 1
215 // Only some interrupt vectors are hardcoded
216 }
217	// each subtree has two leaves? Each bit at the top corresponds to a subtree?
218 // So, if bit 0 is set, that means subtree 0 (concept) and leaves 0 and 1
219 // So, if bit 1 is set, that means subtree 1 (concept) and leaves 2 and 3
220 // the #define'd interrupt vectors all seem to fall in the lower leaf of subtree 2,
221	// except for INTR_HUB_ACCESS_CNTR_INTR_VECTOR, which is in the lower leaf of subtree 1
222 if (g->chip_id >= NV_CHIP_ID_AMPERE)
223 return IRQ_NONE;
224 }
225 uint32_t intr_mask = nvdebug_readl(g, 0x0100); // NV_PMC_INTR
226 printk(KERN_INFO "[nvdebug] Interrupt on IRQ %d with MC_INTR %#010x @ %llu.\n", irq_num, intr_mask, time);
227 // IDs likely changed Ampere+
228 //if (g->chip_id >= NV_CHIP_ID_AMPERE) {
229 // CIC is central interrupt controller
230 // the u32 passed around nvgpu cic functions is one of the
231 // enable is nvgpu_cic_mon_intr_stall_unit_config(unit)
232 // - Calls intr_stall_unit_config(unit)
233 // - for ga, calls unit = ga10b_intr_map_mc_stall_unit_to_intr_unit(unit) (doesn't do much)
234 // - for ga, calls nvgpu_cic_mon_intr_get_unit_info()
235 // - Does *subtree = g->mc.intr_unit_info[unit].subtree;
236 // *subtree_mask = g->mc.intr_unit_info[unit].subtree_mask;
237 // - for ga, calls ga10b_intr_config() w/ subtree info
238 //uint32_t intr_stats = nvdebug_readl(g, 1600
239 //return IRQ_NONE;
240 //}
241 if (intr_mask & 1 << 5)
242 printk(KERN_INFO "[nvdebug] - Interrupt on LCE0.\n");
243 if (intr_mask & 1 << 6)
244 printk(KERN_INFO "[nvdebug] - Interrupt on LCE1.\n");
245 if (intr_mask & 1 << 7)
246 printk(KERN_INFO "[nvdebug] - Interrupt on LCE2.\n");
247 if (intr_mask & 1 << 8) {
248 printk(KERN_INFO "[nvdebug] - Interrupt on PFIFO.\n");
249 nvdebug_fifo_intr(g);
250 }
251 if (intr_mask & 1 << 9) {
252 printk(KERN_INFO "[nvdebug] - Interrupt on HUB.\n"); // "replayable_fault_pending" in nvgpu on Pascal, "HUB" on Volta+
253 // on tu104, if vector is one of the below set in new-style interrupt vector, then MMU fault
254 // - info_fault (134)
255 // - nonreplay_fault error (133)
256 // - nonreplay_fault notify (132)
257 // - replay_fault error (131)
258 // - replay_fault notify (64)
259 // (but the above fault vectors are configurable)
260 // if it's ecc_error, then not mmu error
261 // Default fault vectors from open-gpu-doc/manuals/turing/tu104/pri_mmu_hub.ref.txt
262 // Turing through (at least) Ampere (per nvgpu)
263
264 // on gv100, parse fb_niso_intr_r 0x00100a20U, where bits:
265 // - hub_access_counter notify (0)
266 // - hub_access_counter error (1)
267 // - replay_fault notify (27)
268 // - replay_fault overflow (28)
269 // - nonreplay_fault notify (29)
270 // - nonreplay_fault overflow (30)
271 // - other_fault notify (31)
272 // Volta through Turing (per nvgpu)
273
274 // On Pascal, it looks like it's a property of fifo_intr_0???
275 if (g->chip_id < NV_CHIP_ID_VOLTA)
276 nvdebug_fifo_intr(g);
277 }
278 if (intr_mask & 1 << 10)
279 printk(KERN_INFO "[nvdebug] - Interrupt on LCE3.\n");
280 if (intr_mask & 1 << 11)
281 printk(KERN_INFO "[nvdebug] - Interrupt on LCE4.\n");
282 if (intr_mask & 1 << 12) {
283 printk(KERN_INFO "[nvdebug] - Interrupt on Graphics/Compute.\n");
284 // Kepler through (at least) Ampere
285 // From open-gpu-doc/manuals/volta/gv100/dev_graphics.ref.txt
286 uint32_t graph_intr_mask = nvdebug_readl(g, 0x400100); // NV_PGRAPH_INTR
287 if (graph_intr_mask & 1 << 0)
288 printk(KERN_INFO "[nvdebug] - Interrupt NOTIFY.\n");
289 if (graph_intr_mask & 1 << 1)
290 printk(KERN_INFO "[nvdebug] - Interrupt SEMAPHORE.\n");
291 if (graph_intr_mask & 1 << 4)
292 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_METHOD.\n");
293 if (graph_intr_mask & 1 << 5)
294 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_CLASS.\n");
295 if (graph_intr_mask & 1 << 6)
296 printk(KERN_INFO "[nvdebug] - Interrupt ILLEGAL_NOTIFY.\n");
297 if (graph_intr_mask & 1 << 7)
298 printk(KERN_INFO "[nvdebug] - Interrupt DEBUG_METHOD.\n");
299 if (graph_intr_mask & 1 << 8)
300 printk(KERN_INFO "[nvdebug] - Interrupt FIRMWARE_METHOD.\n");
301 if (graph_intr_mask & 1 << 16)
302 printk(KERN_INFO "[nvdebug] - Interrupt BUFFER_NOTIFY.\n");
303 if (graph_intr_mask & 1 << 19)
304 printk(KERN_INFO "[nvdebug] - Interrupt FECS_ERROR.\n");
305 if (graph_intr_mask & 1 << 20)
306 printk(KERN_INFO "[nvdebug] - Interrupt CLASS_ERROR.\n");
307 if (graph_intr_mask & 1 << 21)
308 printk(KERN_INFO "[nvdebug] - Interrupt EXCEPTION.\n");
309 }
310 if (intr_mask & 1 << 13)
311 printk(KERN_INFO "[nvdebug] - Interrupt on PFB.\n");
312 if (intr_mask & 1 << 15)
313 printk(KERN_INFO "[nvdebug] - Interrupt on SEC.\n");
314 if (intr_mask & 1 << 16)
315 printk(KERN_INFO "[nvdebug] - Interrupt on NVENC0.\n");
316 if (intr_mask & 1 << 17)
317 printk(KERN_INFO "[nvdebug] - Interrupt on NVDEC0.\n");
318 if (intr_mask & 1 << 18)
319 printk(KERN_INFO "[nvdebug] - Interrupt on THERMAL.\n");
320 if (intr_mask & 1 << 19)
321 printk(KERN_INFO "[nvdebug] - Interrupt on HDACODEC.\n");
322 if (intr_mask & 1 << 20)
323 printk(KERN_INFO "[nvdebug] - Interrupt on PTIMER.\n");
324 if (intr_mask & 1 << 21)
325 printk(KERN_INFO "[nvdebug] - Interrupt on PMGR.\n");
326 if (intr_mask & 1 << 22)
327 printk(KERN_INFO "[nvdebug] - Interrupt on IOCTRL.\n");
328 if (intr_mask & 1 << 23)
329 printk(KERN_INFO "[nvdebug] - Interrupt on DFD.\n");
330 if (intr_mask & 1 << 24)
331 printk(KERN_INFO "[nvdebug] - Interrupt on PMU.\n");
332 if (intr_mask & 1 << 25)
333 printk(KERN_INFO "[nvdebug] - Interrupt on LTC.\n");
334 if (intr_mask & 1 << 26)
335 printk(KERN_INFO "[nvdebug] - Interrupt on PDISP.\n");
336 if (intr_mask & 1 << 27)
337 printk(KERN_INFO "[nvdebug] - Interrupt on GSP.\n");
338 if (intr_mask & 1 << 28)
339 printk(KERN_INFO "[nvdebug] - Interrupt on PBUS.\n");
340 if (intr_mask & 1 << 29)
341 printk(KERN_INFO "[nvdebug] - Interrupt on XVE.\n");
342 if (intr_mask & 1 << 30)
343 printk(KERN_INFO "[nvdebug] - Interrupt on PRIV_RING.\n");
344	if (intr_mask & 1 << 31)
345 printk(KERN_INFO "[nvdebug] - Interrupt on SOFTWARE.\n");
346
77 return IRQ_NONE; // We don't actually handle any interrupts. Pass them on. 347 return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
78} 348}
79#endif // INTERRUPT_DEBUG 349#endif // INTERRUPT_DEBUG
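
The IRQ tap above repeatedly uses the same leaf-indexing arithmetic: interrupt vector V lives in CPU_INTR_LEAF register V / 32, at bit V % 32. A minimal userspace sketch of that arithmetic, using the vector numbers quoted in the comments (136/137 for PFIFO0/PFIFO1 on ga100) and dummy leaf values instead of real register reads:

    #include <stdint.h>
    #include <stdio.h>

    /* Return nonzero if interrupt vector `v` is pending in the 8-entry
     * CPU_INTR_LEAF array: vector v lives in leaf v / 32, bit v % 32. */
    static int vector_pending(const uint32_t leaf[8], unsigned v)
    {
        return (leaf[v / 32] >> (v % 32)) & 1;
    }

    int main(void)
    {
        uint32_t leaf[8] = {0};

        /* Pretend vector 136 (PFIFO0 on ga100, per the comments above) fired:
         * 136 / 32 == 4 and 136 % 32 == 8, so set bit 8 of leaf 4. */
        leaf[136 / 32] |= 1u << (136 % 32);

        printf("PFIFO0 (136) pending? %d\n", vector_pending(leaf, 136)); /* 1 */
        printf("PFIFO1 (137) pending? %d\n", vector_pending(leaf, 137)); /* 0 */
        return 0;
    }
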
@@ -135,6 +405,7 @@ int probe_and_cache_devices(void) {
135 g_nvdebug_state[i].pcid = NULL; 405 g_nvdebug_state[i].pcid = NULL;
136 g_nvdebug_state[i].platd = platd; 406 g_nvdebug_state[i].platd = platd;
137 g_nvdebug_state[i].dev = dev; 407 g_nvdebug_state[i].dev = dev;
408 INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs);
138 // Don't check Chip ID until everything else is initalized 409 // Don't check Chip ID until everything else is initalized
139 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); 410 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
140 if (ids.raw == -1) { 411 if (ids.raw == -1) {
@@ -152,6 +423,11 @@ int probe_and_cache_devices(void) {
152 mc_boot_0_t ids; 423 mc_boot_0_t ids;
153 g_nvdebug_state[i].g = NULL; 424 g_nvdebug_state[i].g = NULL;
154 // Map BAR0 (GPU control registers) 425 // Map BAR0 (GPU control registers)
426 // XXX: Don't use pci_iomap. This adds support for I/O registers, but we do
427 // not use the required ioread/write functions for those regions. We
428	// should use pci_ioremap_bar, which is explicitly for MMIO regions.
429 // pci_ioremap_bar -> ioremap_nocache (all platforms)
430 // pci_iomap -> ioremap_nocache (on x86)
155 g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0); 431 g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
156 if (!g_nvdebug_state[i].regs) { 432 if (!g_nvdebug_state[i].regs) {
157 pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n"); 433 pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
@@ -163,9 +439,14 @@ int probe_and_cache_devices(void) {
163 // (vesafb may map the top half for display) 439 // (vesafb may map the top half for display)
164 if (!g_nvdebug_state[i].bar3) 440 if (!g_nvdebug_state[i].bar3)
165 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); 441 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
442	// Observed on H100: BAR1 was moved to BAR2, BAR2 to BAR3, and BAR3
443	// was moved to BAR4.
444 if (!g_nvdebug_state[i].bar3)
445 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 4, 0);
166 g_nvdebug_state[i].pcid = pcid; 446 g_nvdebug_state[i].pcid = pcid;
167 g_nvdebug_state[i].platd = NULL; 447 g_nvdebug_state[i].platd = NULL;
168 g_nvdebug_state[i].dev = &pcid->dev; 448 g_nvdebug_state[i].dev = &pcid->dev;
449 INIT_LIST_HEAD(&g_nvdebug_state[i].pd_allocs);
169 // Don't check Chip ID until everything else is initalized 450 // Don't check Chip ID until everything else is initalized
170 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); 451 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
171 if (ids.raw == -1) { 452 if (ids.raw == -1) {
@@ -175,9 +456,17 @@ int probe_and_cache_devices(void) {
175 g_nvdebug_state[i].chip_id = ids.chip_id; 456 g_nvdebug_state[i].chip_id = ids.chip_id;
176 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.", 457 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
177 ids.chip_id, ARCH2NAME(ids.architecture)); 458 ids.chip_id, ARCH2NAME(ids.architecture));
178#if INTERRUPT_DEBUG 459#ifdef INTERRUPT_DEBUG
179 if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) { 460 // For this to work, you must also add IRQF_SHARED to the flags
180 printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n"); 461 // argument of the request_threaded_irq() call in the nvidia driver
462 // (file /usr/src/nvidia.../nvidia/nv.c and nv-msi.c with dkms)
463 // Then run:
464 // sudo dkms remove nvidia-srv/VER -k $(uname -r)
465 // sudo dkms install nvidia-srv/VER -k $(uname -r) --force
466	// where VER is the version of the nvidia module (e.g. 535.216.03)
467 int err;
468 if ((err = request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", &g_nvdebug_state[i]))) {
469 printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap, error %d\n", err);
181 } 470 }
182#endif // INTERRUPT_DEBUG 471#endif // INTERRUPT_DEBUG
183 i++; 472 i++;
@@ -335,6 +624,40 @@ int __init nvdebug_init(void) {
335 "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops), 624 "enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops),
336 (void*)last_runlist)) 625 (void*)last_runlist))
337 goto out_nomem; 626 goto out_nomem;
627 // Create file `/proc/gpu#/runlist#/wfi_preempt_channel`, world writable
628 // On Turing and older, `/proc/gpu#/wfi_preempt_channel`
629 if (!proc_create_data(
630 "wfi_preempt_channel", 0222, chram_scope, compat_ops(&wfi_preempt_channel_file_ops),
631 (void*)last_runlist))
632 goto out_nomem;
633 // Create file `/proc/gpu#/runlist#/cta_preempt_channel`, world writable
634 // On Turing and older, `/proc/gpu#/cta_preempt_channel`
635 if (!proc_create_data(
636 "cta_preempt_channel", 0222, chram_scope, compat_ops(&cta_preempt_channel_file_ops),
637 (void*)last_runlist))
638 goto out_nomem;
639 // Compute-instruction-level (CIL) preemption is only available on Pascal+
640 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
641 // Create file `/proc/gpu#/runlist#/cil_preempt_channel`, world writable
642 // On Turing and older, `/proc/gpu#/cil_preempt_channel`
643 if (!proc_create_data(
644 "cil_preempt_channel", 0222, chram_scope, compat_ops(&cil_preempt_channel_file_ops),
645 (void*)last_runlist))
646 goto out_nomem;
647 }
648 // Create files which enable on-GPU scheduling (Pascal+)
649 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
650 // Create file `/proc/gpu#/map_mem_chid`, root writable
651 if (!proc_create_data(
652 "map_mem_chid", 0200, chram_scope, compat_ops(&map_mem_chid_file_ops),
653 (void*)last_runlist))
654 goto out_nomem;
655 // Create file `/proc/gpu#/map_mem_ctxid`, root writable
656 if (!proc_create_data(
657 "map_mem_ctxid", 0222, rl_dir, compat_ops(&map_mem_ctxid_file_ops),
658 (void*)last_runlist))
659 goto out_nomem;
660 }
338 } 661 }
339 // Create file `/proc/gpu#/runlist#/runlist`, world readable 662 // Create file `/proc/gpu#/runlist#/runlist`, world readable
340 if (!proc_create_data( 663 if (!proc_create_data(
@@ -346,16 +669,26 @@ int __init nvdebug_init(void) {
346 "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops), 669 "switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops),
347 (void*)last_runlist)) 670 (void*)last_runlist))
348 goto out_nomem; 671 goto out_nomem;
672 /* On the TU104, the context scheduler (contained in the Host, aka
673	 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
674 * containing re-enabled channels. Resubmitting the runlist
675 * configuration appears to remediate this condition, and so this API
676 * is exposed to help reset GPU scheduling as necessary.
677 */
678	// Create file `/proc/gpu#/runlist#/resubmit_runlist`, world writable
679 if (!proc_create_data(
680 "resubmit_runlist", 0222, rl_dir, compat_ops(&resubmit_runlist_file_ops),
681 (void*)device_id))
682 goto out_nomem;
349 } while (last_runlist-- > 0); 683 } while (last_runlist-- > 0);
350 /* On the TU104, the context scheduler (contained in the Host, aka 684 // Create file `/proc/gpu#/preempt_runlist`, world writable
351	 * PFIFO, unit) has been observed to sometimes fail to schedule TSGs
352 * containing re-enabled channels. Resubmitting the runlist
353 * configuration appears to remediate this condition, and so this API
354 * is exposed to help reset GPU scheduling as necessary.
355 */
356 // Create file `/proc/gpu#/resubmit_runlist`, world writable
357 if (!proc_create_data( 685 if (!proc_create_data(
358 "resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops), 686 "preempt_runlist", 0222, dir, compat_ops(&preempt_runlist_file_ops),
687 (void*)device_id))
688 goto out_nomem;
689 // Create file `/proc/gpu#/ack_bad_tsg`, world writable
690 if (!proc_create_data(
691 "ack_bad_tsg", 0222, dir, compat_ops(&ack_bad_tsg_file_ops),
359 (void*)device_id)) 692 (void*)device_id))
360 goto out_nomem; 693 goto out_nomem;
361 // Create file `/proc/gpu#/device_info`, world readable 694 // Create file `/proc/gpu#/device_info`, world readable
@@ -394,6 +727,68 @@ int __init nvdebug_init(void) {
394 (void*)NV_FUSE_GPC_GM107)) 727 (void*)NV_FUSE_GPC_GM107))
395 goto out_nomem; 728 goto out_nomem;
396 } 729 }
730 // Create file `/proc/gpu#/CWD_SM_ID#`, world readable (Maxwell+)
731 // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable (Maxwell+)
732 // - 6 entries on Maxwell (nvgpu)
733 // - 16 entries on Pascal through Ampere (at least) (nvgpu, open-gpu-doc)
734 // - 24 entries on Hopper through Ada (at least) (XXXX)
735 // XXX: Only working while a context is active
736 // XXX: Needed for libsmctrl2; hacky
737 // Tested on GP104, TU102, GV100, AD102
738 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_HOPPER) {
739 char file_name[21];
740 long i;
741 for (i = 0; i < 24; i++) {
742 snprintf(file_name, 20, "CWD_SM_ID%ld", i);
743 if (!proc_create_data(
744 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
745 (void*)(0x00405100+4*i))) // XXX: From XXXX
746 goto out_nomem;
747 // 18 entries on Ada (RTX 6000 Ada)
748 // Returns 0xbadf1201 if GPU not active
749 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
750 if (!proc_create_data(
751 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
752 // Nothing between this location and CWD_SM_ID
753 (void*)((0x00405000)+4*i))) // Found via reverse search from CWD_SM_ID location on Ada
754 goto out_nomem;
755 // Nothing in the following 28 words (before 0x00405220)
756 }
757 } else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
758 char file_name[21];
759 long i;
760 union reg_range num_gpc_range;
761 for (i = 0; i < 16; i++) {
762 snprintf(file_name, 20, "CWD_SM_ID%ld", i);
763 if (!proc_create_data(
764 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
765 (void*)(0x00405ba0+4*i))) // NV_PGRAPH_PRI_CWD_SM_ID(i)
766 goto out_nomem;
767 // ? entries on Maxwell
768 // 8 entries on Pascal (test)
769 // 16 entries on Volta through Ampere (open-gpu-doc)
770	// Returns 0 if GPU not active
771 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
772 if (!proc_create_data(
773 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
774 (void*)(0x00405b60+4*i))) // NV_PGRAPH_PRI_CWD_GPC_TPC_ID(i)
775 goto out_nomem;
776 }
777 num_gpc_range.offset = 0x00405b00; // NV_PGRAPH_PRI_CWD_FS
778 // Lower eight bits of register are _NUM_GPCS
779 num_gpc_range.start_bit = 0;
780 num_gpc_range.stop_bit = 8;
781 if (!proc_create_data(
782 "CWD_FS_NUM_GPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
783 (void*)(num_gpc_range.raw)))
784 goto out_nomem;
785 num_gpc_range.start_bit = 8;
786 num_gpc_range.stop_bit = 16;
787 if (!proc_create_data(
788 "CWD_FS_NUM_TPCS", 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
789 (void*)(num_gpc_range.raw)))
790 goto out_nomem;
791 }
397 // Create file `/proc/gpu#/local_memory`, world readable (Pascal+) 792 // Create file `/proc/gpu#/local_memory`, world readable (Pascal+)
398 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { 793 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
399 if (!proc_create_data( 794 if (!proc_create_data(
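
The reg_range files above (CWD_FS_NUM_GPCS and CWD_FS_NUM_TPCS) expose a bit field of a register; from the 0/8 and 8/16 start/stop pairs used for _NUM_GPCS and _NUM_TPCS, stop_bit appears to be exclusive. A standalone sketch of that presumed extraction (the register value below is hypothetical, and this mirrors the apparent semantics rather than the actual nvdebug_read_reg_range_file_ops implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* Extract bits [start, stop) of a 32-bit register value. */
    static uint32_t reg_range(uint32_t val, unsigned start, unsigned stop)
    {
        unsigned width = stop - start;
        uint32_t mask = width >= 32 ? ~0u : (1u << width) - 1;
        return (val >> start) & mask;
    }

    int main(void)
    {
        uint32_t cwd_fs = 0x0000080e; /* hypothetical NV_PGRAPH_PRI_CWD_FS value */

        printf("NUM_GPCS = %u\n", reg_range(cwd_fs, 0, 8));  /* bits 0-7  -> 14 */
        printf("NUM_TPCS = %u\n", reg_range(cwd_fs, 8, 16)); /* bits 8-15 -> 8 */
        return 0;
    }
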
@@ -414,6 +809,50 @@ int __init nvdebug_init(void) {
414 (void*)NV_CE_PCE_MAP)) 809 (void*)NV_CE_PCE_MAP))
415 goto out_nomem; 810 goto out_nomem;
416 } 811 }
812 // Create files exposing subcontext partitioning (Volta+)
813 // TODO: Make this not a hack with undocumented magic numbers
814 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
815 char file_name[21];
816 long i;
817 // Create file `/proc/gpu#/partition_ctl`, world readable
818 if (!proc_create_data(
819 "partition_ctl", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
820 (void*)0x00405b2c))
821 goto out_nomem;
822 // Create file `/proc/gpu#/partition_data`, world readable
823 if (!proc_create_data(
824 "partition_data", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
825 (void*)0x00405b30))
826 goto out_nomem;
827 // Create file `/proc/gpu#/partition_data#`, world readable
828 for (i = 0; i < 64; i++) {
829 snprintf(file_name, 20, "partition_data%ld", i);
830 if (!proc_create_data(
831 file_name, 0444, dir, compat_ops(&nvdebug_read_part_file_ops),
832 (void*)i))
833 goto out_nomem;
834 }
835 // For debugging what MPS is changing
836 // Create file `/proc/gpu#/CWD_CG0`, world readable
837 if (!proc_create_data(
838 "CWD_CG0", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
839 (void*)0x00405bf0))
840 goto out_nomem;
841 // Create file `/proc/gpu#/CWD_CG1`, world readable
842 if (!proc_create_data(
843 "CWD_CG1", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
844 (void*)0x00405bf4))
845 goto out_nomem;
846 // Create file `/proc/gpu#/CWD_GPC_TPC_ID#`, world readable
847 // This does not appear to work on Hopper. Works on Ampere.
848 /*for (i = 0; i < 16; i++) {
849 snprintf(file_name, 20, "CWD_GPC_TPC_ID%ld", i);
850 if (!proc_create_data(
851 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
852 (void*)(0x00405b60+4*i)))
853 goto out_nomem;
854 }*/
855 }
417 } 856 }
418 // (See Makefile if you want to know the origin of GIT_HASH.) 857 // (See Makefile if you want to know the origin of GIT_HASH.)
419 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n"); 858 printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
@@ -439,16 +878,19 @@ static void __exit nvdebug_exit(void) {
439 char device_id[7]; 878 char device_id[7];
440 snprintf(device_id, 7, "gpu%d", g_nvdebug_devices); 879 snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
441 remove_proc_subtree(device_id, NULL); 880 remove_proc_subtree(device_id, NULL);
881 // Force-free associated allocations
442 g = &g_nvdebug_state[g_nvdebug_devices]; 882 g = &g_nvdebug_state[g_nvdebug_devices];
883 gc_page_directory(g, true);
443 // Free BAR mappings for PCIe devices 884 // Free BAR mappings for PCIe devices
444 if (g && g->pcid) { 885 if (g && g->pcid) {
886#ifdef INTERRUPT_DEBUG
887 // IRQ handler uses g->regs, so free IRQ first
888 free_irq(g->pcid->irq, g);
889#endif // INTERRUPT_DEBUG
445 if (g->regs) 890 if (g->regs)
446 pci_iounmap(g->pcid, g->regs); 891 pci_iounmap(g->pcid, g->regs);
447 if (g->bar2) 892 if (g->bar2)
448 pci_iounmap(g->pcid, g->bar2); 893 pci_iounmap(g->pcid, g->bar2);
449#if INTERRUPT_DEBUG
450 free_irq(g->pcid->irq, g->pcid);
451#endif // INTERRUPT_DEBUG
452 } else { 894 } else {
453 if (g->regs) 895 if (g->regs)
454 iounmap(g->regs); 896 iounmap(g->regs);
diff --git a/nvdebug_linux.h b/nvdebug_linux.h
index 2ad4ce1..b232720 100644
--- a/nvdebug_linux.h
+++ b/nvdebug_linux.h
@@ -20,6 +20,11 @@ static inline struct gk20a *get_gk20a(struct device *dev) {
20#define pde_data PDE_DATA 20#define pde_data PDE_DATA
21#endif 21#endif
22 22
23// iommu_map() requires an extra parameter on Linux 6.3+
24#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,3,0)
25#define iommu_map(a, b, c, d, e) iommu_map(a, b, c, d, e, GFP_KERNEL)
26#endif
27
23// We use the data field of the proc_dir_entry ("PDE" in this function) to store 28// We use the data field of the proc_dir_entry ("PDE" in this function) to store
24// our index into the g_nvdebug_state array 29// our index into the g_nvdebug_state array
25static inline int seq2gpuidx(struct seq_file *s) { 30static inline int seq2gpuidx(struct seq_file *s) {
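
The iommu_map() shim above leans on a C preprocessor rule: a function-like macro is not re-expanded inside its own replacement list, so the wrapped name can forward to the real function while appending the new GFP argument. A small userspace illustration of the same trick, using a hypothetical do_map() function that gained a fourth parameter:

    #include <stdio.h>

    /* "New" signature: a flags parameter was appended in a later version. */
    static int do_map(unsigned long iova, unsigned long paddr, unsigned long size,
                      unsigned flags)
    {
        printf("map %#lx -> %#lx (%lu bytes, flags %#x)\n", iova, paddr, size, flags);
        return 0;
    }

    /* Callers written against the old three-argument signature keep working:
     * the macro name is not re-expanded inside its own replacement list, so
     * this forwards to the real function with a default for the new argument. */
    #define do_map(a, b, c) do_map(a, b, c, 0)

    int main(void)
    {
        return do_map(0x1000UL, 0x2000UL, 4096UL);
    }
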
diff --git a/runlist.c b/runlist.c
index 7bb2ee4..3076d27 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,19 +1,13 @@
1/* Copyright 2024 Joshua Bakita 1/* Copyright 2024 Joshua Bakita
2 * Helpers for dealing with the runlist and other Host (PFIFO) registers 2 * Helpers for dealing with the runlist and other Host (PFIFO) registers
3 */ 3 */
4#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
4#include <linux/printk.h> // For printk() 5#include <linux/printk.h> // For printk()
5#include <asm/errno.h> // For error defines 6#include <asm/errno.h> // For error defines
6#include <asm/io.h> // For phys_to_virt() 7#include <asm/io.h> // For phys_to_virt()
7 8
8#include "nvdebug.h" 9#include "nvdebug.h"
9 10
10// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
11// in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN
12// **must** not be moved during runlist traversal.
13// The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this must
14// be enabled to print the runlist on the TX2.
15//#define FALLBACK_TO_PRAMIN
16
17/* Get RunList RAM (RLRAM) offset for a runlist from the device topology 11/* Get RunList RAM (RLRAM) offset for a runlist from the device topology
18 @param rl_id Which runlist to obtain [numbered in order of appearance in 12 @param rl_id Which runlist to obtain [numbered in order of appearance in
19 the device topology (PTOP) registers] 13 the device topology (PTOP) registers]
@@ -116,6 +110,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
116 runlist_len = submit.len; 110 runlist_len = submit.len;
117 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", 111 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n",
118 rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); 112 rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
113 printk(KERN_INFO "[nvdebug] Runlist offset is %d\n", submit.offset);
119 rl_iter->runlist_pri_base = runlist_pri_base; 114 rl_iter->runlist_pri_base = runlist_pri_base;
120 } 115 }
121 // Return early on an empty runlist 116 // Return early on an empty runlist
@@ -130,6 +125,12 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
130 if ((err = get_bar2_pdb(g, &pd_config)) < 0) 125 if ((err = get_bar2_pdb(g, &pd_config)) < 0)
131 goto attempt_pramin_access; 126 goto attempt_pramin_access;
132 127
128 // XXX: PD version detection not working on Hopper [is_ver2 errantly (?) unset]
129 if (g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) {
130 printk(KERN_WARNING "[nvdebug] V3 page tables do not currently work on Hopper! Mystery config: %llx\n", pd_config.raw);
131 err = -EOPNOTSUPP;
132 goto attempt_pramin_access;
133 }
133 if (pd_config.is_ver2) 134 if (pd_config.is_ver2)
134 runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM); 135 runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova, TARGET_VID_MEM);
135 else 136 else
@@ -233,7 +234,7 @@ int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id) {
233} 234}
234 235
235// Read and write runlist configuration, triggering a resubmit 236// Read and write runlist configuration, triggering a resubmit
236int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { 237int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id, uint32_t off) {
237 // Necessary registers do not exist pre-Fermi 238 // Necessary registers do not exist pre-Fermi
238 if (g->chip_id < NV_CHIP_ID_FERMI) 239 if (g->chip_id < NV_CHIP_ID_FERMI)
239 return -EOPNOTSUPP; 240 return -EOPNOTSUPP;
@@ -252,6 +253,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
252 return -EINVAL; 253 return -EINVAL;
253 if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1) 254 if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1)
254 return -EIO; 255 return -EIO;
256 preempt_runlist(g, rl_id);
257 if (off != -1)
258 submit.offset = off;
255 nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw); 259 nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw);
256 } else { 260 } else {
257 int err; 261 int err;
@@ -261,6 +265,9 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
261 return err; 265 return err;
262 if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) 266 if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1)
263 return -EIO; 267 return -EIO;
268 preempt_runlist(g, rl_id);
269 if (off != -1)
270 submit.offset = off;
264 // On Ampere, this does not appear to trigger a preempt of the 271 // On Ampere, this does not appear to trigger a preempt of the
265 // currently-running channel (even if the currently running channel 272 // currently-running channel (even if the currently running channel
266 // becomes disabled), but will cause newly re-enabled channels 273 // becomes disabled), but will cause newly re-enabled channels
@@ -270,3 +277,255 @@ int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) {
270 } 277 }
271 return 0; 278 return 0;
272} 279}
280
281/* Get a CPU-accessible pointer to an arbitrary-address-space instance block
282 @param instance_addr Address of instance block
283	 @param instance_target Aperture/target of instance block address
284 @return A dereferencable KVA, NULL if not found, or an ERR_PTR-wrapped error
285
286 Note: The returned address will be a BAR2 or physical address, mapped into
287 kernel space, /not/ a PRAMIN-derived address. Thus, the returned
288	 address will have an indefinite lifetime, and will be unaffected by use
289 of PRAMIN elsewhere (such as to read the CTXSW block).
290*/
291instance_ctrl_t *instance_deref(struct nvdebug_state *g, uint64_t instance_addr,
292 enum INST_TARGET instance_target) {
293 if (!instance_addr || instance_target == TARGET_INVALID)
294 return ERR_PTR(-EINVAL);
295 if (instance_target == TARGET_VID_MEM) {
296 int err;
297 uint64_t inst_bar_vaddr;
298 page_dir_config_t pd_config;
299 // Only access VID_MEM via BAR2; do not fall back to PRAMIN
300 if (!g->bar2)
301 return NULL;
302 // Find page tables which define how BAR2/3 offsets are translated to
303 // physical VID/SYS_MEM addresses.
304 if ((err = get_bar2_pdb(g, &pd_config)) < 0) {
305 printk(KERN_ERR "[nvdebug] Error: Unable to access page directory "
306 "configuration for BAR2/3. Error %d.\n", err);
307 return ERR_PTR(err);
308 }
309 // Search the BAR2/3 page tables for the offset at which the instance
310 // block is mapped (reverse translation).
311 if (pd_config.is_ver2)
312 inst_bar_vaddr = search_page_directory(g, pd_config, instance_addr, instance_target);
313 else
314 inst_bar_vaddr = search_v1_page_directory(g, pd_config, instance_addr, instance_target);
315 if (!inst_bar_vaddr) {
316 printk(KERN_WARNING "[nvdebug] Warning: Instance block %#018llx "
317 "(%s) appears unmapped in BAR2/3.\n", instance_addr,
318 target_to_text(instance_target));
319 return NULL;
320 }
321 return g->bar2 + inst_bar_vaddr;
322 } else {
323 struct iommu_domain *dom;
324 // SYS_MEM addresses are physical addresses *from the perspective of
325 // the device* ("bus addresses"), and may not necessarially correspond
326 // to physical addresses from the perspective of the CPU. The I/O MMU
327 // is responsible for mapping bus addresses to CPU-relative physical
328 // addresses when there is no direct correspondence. If an I/O MMU is
329 // enabled on this GPU, ask it to translate the bus address to a
330 // CPU-relative physical address.
331 if ((dom = iommu_get_domain_for_dev(g->dev))) {
332 // XXX: As of Aug 2024, this is not tested, so include extra logging
333 printk(KERN_DEBUG "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#llx for instance block", instance_addr);
334 if (!(instance_addr = iommu_iova_to_phys(dom, instance_addr))) {
335 printk(KERN_ERR "[nvdebug] Error: I/O MMU failed to translate "
336 "%#018llx (%s) to a CPU-relative physical address.\n",
337 instance_addr, target_to_text(instance_target));
338 return ERR_PTR(-EADDRNOTAVAIL);
339 }
340 printk(KERN_DEBUG " to physical address %#llx.\n", instance_addr);
341 }
342 // Convert from a physical address to a kernel virtual address (KVA)
343 return phys_to_virt(instance_addr);
344 }
345}
346
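
instance_deref() uses the kernel's three-way return convention: a usable pointer, NULL for "not found", or a negative errno encoded into the pointer with ERR_PTR(). A self-contained userspace sketch of that convention (the helpers below are minimal re-implementations for illustration only; the kernel's versions live in <linux/err.h>, and lookup() is a made-up stand-in for instance_deref()):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Minimal userspace re-implementations of the <linux/err.h> helpers. */
    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }
    static int IS_ERR_OR_NULL(const void *p) { return !p || IS_ERR(p); }

    static int table[4] = {10, 20, 30, 40};

    /* Made-up lookup with the same three-way contract as instance_deref(). */
    static int *lookup(int idx)
    {
        if (idx < 0)
            return ERR_PTR(-EINVAL); /* caller error, encoded in the pointer */
        if (idx >= 4)
            return NULL;             /* simply not found */
        return &table[idx];          /* dereferenceable pointer */
    }

    int main(void)
    {
        int *p = lookup(-1);

        if (IS_ERR_OR_NULL(p))
            printf("error or missing: %ld\n", IS_ERR(p) ? PTR_ERR(p) : 0L);
        p = lookup(2);
        if (!IS_ERR_OR_NULL(p))
            printf("value: %d\n", *p);
        return 0;
    }
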
347/* Get a CPU-accessible pointer to the CTXSW block for a channel instance block
348 @param inst Dereferencable pointer to the start of a complete instance block
349 @return A dereferencable KVA, NULL if not found, or an ERR_PTR-wrapped error
350
351 Note: The returned address **will** be a PRAMIN-based address. Any changes to
352 PRAMIN **will** invalidate the returned pointer. `inst` **cannot** be a
353 pointer into the PRAMIN space.
354*/
355context_switch_ctrl_t *get_ctxsw(struct nvdebug_state *g,
356 instance_ctrl_t *inst) {
357 int err;
358 context_switch_ctrl_t *wfi = NULL;
359 uint64_t wfi_virt, wfi_phys, ctxsw_virt, ctxsw_phys;
360 enum INST_TARGET wfi_phys_aperture, ctxsw_phys_aperture;
361
362 // The WFI block contains a pointer to the CTXSW block, which contains the
363 // preemption mode configuration for the context. (As best I can tell, the WFI
364	 // block is subcontext-specific, whereas the CTXSW block is context-wide.)
365 wfi_virt = (uint64_t)inst->engine_wfi_ptr << 12;
366
367 // WFI may not be configured
368 if (!wfi_virt)
369 goto out;
370
371 // Determine the physical location of the WFI block
372 if (inst->engine_wfi_is_virtual) {
373 if (inst->pdb.is_ver2)
374 err = translate_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture);
375 else
376 err = translate_v1_page_directory(g, inst->pdb, wfi_virt, &wfi_phys, &wfi_phys_aperture);
377 if (err) {
378 printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; WFI block "
379 "pointer %#018llx (virt) cannot be found in process page tables! "
380 "Translation error %d.\n", wfi_virt, -err);
381 return ERR_PTR(-ENOTRECOVERABLE);
382 }
383 } else {
384 wfi_phys = (uint64_t)inst->engine_wfi_ptr << 12;
385 wfi_phys_aperture = inst->engine_wfi_target;
386 }
387
388	 // Get a dereferencable pointer to the WFI block (the WFI and CTXSW blocks
389 // have not been observed as mapped in BAR2/3, so we use the PRAMIN window).
390 // Note: On Jetson boards, we could attempt to avoid PRAMIN since CTXSW is in
391 // SYS_MEM, but this function will always need to use PRAMIN to work
392 // around the WFI and CTXSW blocks not being accessible via BAR2/3 on
393	 //       PCIe GPUs, so always use PRAMIN for simplicity.
394 if ((wfi_phys = addr_to_pramin_mut(g, wfi_phys, wfi_phys_aperture)) == -1)
395 goto out;
396 wfi = g->regs + wfi_phys + NV_PRAMIN;
397
398// XXX
399// return wfi;
400// End XXX
401
402 // While the WFI block uses the same layout as the context switch (CTXSW)
403 // control block, it is mostly unpopulated except for a few pointers on GPUs
404 // after Volta. This appears to be related to subcontexts, where each
405 // subcontext has its own WFI block containing a pointer to the overarching
406 // CTXSW block. Only attempt to find the overarching CTXSW block if at least
407 // one subcontext is enabled.
408 if (inst->subcontext_pdb_valid) {
409 // Subcontexts are Volta+-only. Volta only supports Page Table Ver. 2
410 if (!inst->pdb.is_ver2)
411 return ERR_PTR(-ENOTRECOVERABLE);
412	 // Obtain the address of the CTXSW block in this context
413 ctxsw_virt = wfi->context_buffer_ptr_hi;
414 ctxsw_virt <<= 32;
415 ctxsw_virt |= wfi->context_buffer_ptr_lo;
416 if (!ctxsw_virt) {
417 printk(KERN_WARNING "[nvdebug] Warning: WFI block at %#018llx (phys) "
418 "contains an empty context block pointer.\n", wfi_phys);
419 goto out;
420 }
421
422 // All the pointers in the WFI block are virtual, so convert the CTXSW
423 // block pointer to a physical address. We should always be able to find a
424 // mapping for ctxsw_virt.
425 if ((err = translate_page_directory(g, inst->pdb, ctxsw_virt, &ctxsw_phys, &ctxsw_phys_aperture))) {
426 printk(KERN_ERR "[nvdebug] Critical: Inconsistent GPU state; context "
427 "block pointer %#018llx (virt) cannot be found in process page "
428 "tables! Translation error %d.\n", ctxsw_virt, -err);
429 return ERR_PTR(-ENOTRECOVERABLE);
430 }
431
432	 // Get a dereferencable pointer to the CTXSW block (via PRAMIN; invalidates `wfi`)
433 if ((ctxsw_phys = addr_to_pramin_mut(g, ctxsw_phys, ctxsw_phys_aperture)) == -1)
434 goto out;
435 return g->regs + ctxsw_phys + NV_PRAMIN;
436 } else {
437 // Without subcontexts, the WFI block is the CTXSW block (ex: Pascal)
438 return wfi;
439 }
440out:
441 return NULL;
442}
443
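
get_ctxsw() reconstructs two kinds of pointers: 4 KiB-aligned addresses stored shifted right by 12 bits (engine_wfi_ptr), and full 64-bit virtual addresses split into hi/lo halves (context_buffer_ptr_hi/lo). A tiny standalone sketch of both reconstructions with made-up field values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 4 KiB-aligned pointer stored as address >> 12 (engine_wfi_ptr style). */
        uint64_t engine_wfi_ptr = 0x00123456; /* hypothetical field value */
        uint64_t wfi_virt = engine_wfi_ptr << 12;

        /* 64-bit address split into 32-bit halves (context_buffer_ptr_hi/lo style). */
        uint32_t ptr_hi = 0x00000001, ptr_lo = 0x2345e000; /* hypothetical */
        uint64_t ctxsw_virt = ((uint64_t)ptr_hi << 32) | ptr_lo;

        printf("wfi_virt   = %#llx\n", (unsigned long long)wfi_virt);   /* 0x123456000 */
        printf("ctxsw_virt = %#llx\n", (unsigned long long)ctxsw_virt); /* 0x12345e000 */
        return 0;
    }
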
444/* Change the preemption type to be used on a context's budget expiration
445 @param chan_id As context IDs are hard to obtain and use, this function takes
446 a channel ID and looks up and modifies the associated context.
447 @param rl_id Which channel RAM address space is this channel ID in? (Not
448 used on pre-Ampere GPUs.)
449 @param mode Preemption mode to set.
450 @return 0 or -errno on error
451
452 Note: This change will not apply if the channel's context has running work,
453 or if the GPU is idle and this channel's context was last to run.
454 Please ensure some other task is running before calling this API.
455*/
456int set_channel_preemption_mode(struct nvdebug_state *g, uint32_t chan_id,
457 uint32_t rl_id,
458 enum COMPUTE_PREEMPT_TYPE mode) {
459 uint64_t instance_ptr = 0;
460 enum INST_TARGET instance_target;
461 instance_ctrl_t *inst = NULL;
462 context_switch_ctrl_t *ctxsw = NULL;
463 struct runlist_iter rl_iter;
464 uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
465 // Obtain the instance block
466 if (g->chip_id < NV_CHIP_ID_AMPERE) {
467 // Pre-Ampere, Channel RAM includes instance block pointers
468 channel_ctrl_t chan;
469 if (chan_id > MAX_CHID)
470 return -ERANGE;
471 if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chan_id))) == -1)
472 return -EIO;
473 instance_ptr = (uint64_t)chan.inst_ptr << 12;
474 instance_target = chan.inst_target;
475 } else {
476 // Starting with Ampere, instance block pointers are only included in
477 // runlist entries. Something like this could work on Maxwell+, but
478 // access via Channel RAM is more heavily-tested.
479 struct gv100_runlist_chan* chan;
480 int err;
481 loff_t pos = 0;
482 // Based off logic of switch_to_tsg_file_write() in runlist_procfs.c
483 if ((err = get_runlist_iter(g, rl_id, &rl_iter)))
484 return err;
485 while (pos < rl_iter.len && !instance_ptr) {
486 for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
487 if (chan_id == chid(g, chan)) {
488 // Channel entry found in runlist. Extract instance ptr.
489 instance_ptr = (uint64_t)chan->inst_ptr_hi << 32;
490 instance_ptr |= (uint64_t)inst_ptr_lo(g, chan) << 12;
491 instance_target = inst_target(g, chan);
492 break;
493 }
494 }
495 pos += 1 + tsg_length(g, rl_iter.curr_entry);
496 rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
497 }
498 // Context switch timeout configuration register was moved with Ampere+
499 ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
500 }
501 if (!instance_ptr)
502 return -ENOENT;
503 // Obtain an instance block pointer routed via BAR2 or SYS_MEM
504 inst = instance_deref(g, instance_ptr, instance_target);
505 if (IS_ERR_OR_NULL(inst))
506 return PTR_ERR(inst);
507 // Obtain pointer to CTXSW block routed via PRAMIN (the CTXSW block
508 // does not appear to be mapped into BAR2).
509 ctxsw = get_ctxsw(g, inst);
510 if (IS_ERR_OR_NULL(ctxsw))
511 return PTR_ERR(ctxsw);
512 ctxsw->compute_preemption_options = mode;
513 // If switching to a preemption mode that runs blocks or kernels non-
514	 // preemptively (CTA-level and WFI respectively), disable the context switch
515 // timeout. If switching to compute-instruction-level preemption (CILP),
516 // reenable it. Observed to be necessary on (at least) gv11b, tu102, and ga10b
517 // XXX: On ga10b (at least), the timeout configuration is reset on a resume
518 // from suspend, overwriting the change made here. This causes a CTXSW
519 // TIMEOUT interrupt to be triggered if any application tries to run
520 // non-preemptively for longer than the timeout period (3100ms on gv11b
521 // and ga10b).
522 if (g->chip_id >= NV_CHIP_ID_VOLTA) {
523 ctxsw_timeout_t timeout_config;
524 if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
525 return -EIO;
526 printk(KERN_DEBUG "[nvdebug] Previous Ctx. Sw. Timeout Configuration: period %d %s\n", timeout_config.period, timeout_config.enabled ? "enabled" : "disabled");
527 timeout_config.enabled = mode == PREEMPT_CILP;
528 nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
529 }
530 return 0;
531}
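
As wired up in nvdebug_entry.c above, set_channel_preemption_mode() is reached from userspace through the wfi_preempt_channel, cta_preempt_channel, and cil_preempt_channel files, which take a channel ID. A minimal userspace sketch, assuming GPU 0, an Ampere+ layout where the files sit under a directory named runlist0, and channel 4 as the target (per the note above, the change only applies once the channel's context is not the one actively running):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* Hypothetical target: put channel 4's context into CTA-level preemption.
         * On Turing and older the file is /proc/gpu0/cta_preempt_channel instead. */
        const char *path = "/proc/gpu0/runlist0/cta_preempt_channel";
        const char *chan = "4\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror(path);
            return 1;
        }
        if (write(fd, chan, strlen(chan)) < 0)
            perror("write");
        close(fd);
        return 0;
    }
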
diff --git a/runlist_procfs.c b/runlist_procfs.c
index b2159f6..a3a6df3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -1,12 +1,117 @@
1#include <linux/seq_file.h> // For seq_* functions and types 1#include <linux/seq_file.h> // For seq_* functions and types
2#include <linux/version.h> // Macros to detect kernel version 2#include <linux/version.h> // Macros to detect kernel version
3#include <linux/platform_device.h> // For platform_get_resource()
4#include <linux/pci.h> // For pci_resource_start()
5#include <linux/iommu.h> // For iommu_ functions
6#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,10,0)
7#include <linux/dma-map-ops.h> // For get_dma_ops()
8#endif
3 9
4#include "nvdebug_linux.h" 10#include "nvdebug_linux.h"
5 11
6// Uncomment to expand channel status information when printing the runlist 12// We cannot touch PRAMIN (via page table operations or ctxsw access) if we're
13// using it to walk the runlist
14//#ifndef FALLBACK_TO_PRAMIN
15// Uncomment to expand channel status, instance, and context information when
16// printing the runlist
7#define DETAILED_CHANNEL_INFO 17#define DETAILED_CHANNEL_INFO
18//#endif
8 19
9#ifdef DETAILED_CHANNEL_INFO 20#ifdef DETAILED_CHANNEL_INFO
21// Print the channel instance and context switch blocks
22// XXX: THIS IS UNSAFE ON KEPLER!
23// instance_deref() will call into the page table logic, which may move PRAMIN.
24// PRAMIN appears heavily utilized by the driver on Bonham (at least), and
25// moving it causes problems.
26static int runlist_detail_seq_show_inst(struct seq_file *s, struct nvdebug_state *g, char *prefix, uint64_t instance_ptr, enum INST_TARGET instance_target) {
27 instance_ctrl_t *inst = NULL;
28 context_switch_ctrl_t *ctxsw = NULL;
29 int i;
30
31#ifdef FALLBACK_TO_PRAMIN
32 bar0_window_t win;
33 win.raw = nvdebug_readl(g, NV_XAL_EP_BAR0_WINDOW_BASE);
34 inst = g->regs + NV_PRAMIN + addr_to_pramin_mut(g, instance_ptr, instance_target);
35#else
36 if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
37	return PTR_ERR(inst);
38#endif // FALLBACK_TO_PRAMIN
39 // If unable to access instance block, skip
40 if (!inst)
41 return 0;
42
43 // Print the channel instance block
44 // As an ID, use upper 52 bits of the instance address (lower 12 are zero)
45 //seq_printf(s, "%s+- Inst %-13llx-+\n", prefix, instance_ptr >> 12);
46 seq_printf(s, "%s|= Instance Block ====|\n", prefix);
47 seq_printf(s, "%s| Target Engine: %2d|\n", prefix, inst->fc_target);
48 seq_printf(s, "%s| Privileged: %1d|\n", prefix, inst->fc_config_is_priv);
49 seq_printf(s, "%s| Channel VEID: %2d|\n", prefix, inst->fc_chan_info_veid);
50 seq_printf(s, "%s| WFI PTR: |\n", prefix);
51 seq_printf(s, "%s| %#018llx|\n", prefix, (uint64_t)inst->engine_wfi_ptr << 12);
52 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->engine_wfi_target));
53 seq_printf(s, "%s| Virtual address? %d|\n", prefix, inst->engine_wfi_is_virtual);
54 seq_printf(s, "%s| WFI VEID: %2d|\n", prefix, inst->engine_wfi_veid);
55 seq_printf(s, "%s| All PDB PTR: |\n", prefix);
56 seq_printf(s, "%s| %#018llx|\n", prefix, (u64)inst->pdb.page_dir << 12);
57 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->pdb.target));
58 seq_printf(s, "%s| %20s|\n", prefix, inst->pdb.is_volatile ? "volatile" : "non-volatile");
59// seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->pdb.raw);
60 seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(inst->subcontext_pdb_valid));
61 // Print configuration of every enabled subcontext
62 for (i = 0; i < 64; i++) {
63 // Skip subcontexts without their enable bit set
64 if (!(1 & (inst->subcontext_pdb_valid >> i)))
65 continue;
66 seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i, inst->subcontext[i].pasid);
67 seq_printf(s, "%s| SC%02d PDB PTR: |\n", prefix, i);
68 seq_printf(s, "%s| %#018llx|\n", prefix, ((u64)inst->subcontext[i].pdb.page_dir_hi << 32) | ((u64)inst->subcontext[i].pdb.page_dir_lo << 12));
69 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->subcontext[i].pdb.target));
70 seq_printf(s, "%s| %20s|\n", prefix, inst->subcontext[i].pdb.is_volatile ? "volatile" : "non-volatile");
71// seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->subcontext[i].pdb.raw);
72 }
73
74 // XXX: CTXSW is only accessible via PRAMIN. Accessing PRAMIN appears to
75 // either be broken, or race with the driver on Kepler (gk104 tested). So,
76 // do not attempt to touch the CTXSW block on Kepler.
77 // TODO: This check should be moved into addr_to_pramin_mut().
78 if (g->chip_id < NV_CHIP_ID_MAXWELL)
79 return 0;
80 // End XXX
81
82 if (IS_ERR(ctxsw = get_ctxsw(g, inst))) {
83#ifdef FALLBACK_TO_PRAMIN
84 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
85#endif
86 return PTR_ERR(ctxsw);
87 }
88 // If unable to access CTXSW block, skip
89 if (!ctxsw) {
90#ifdef FALLBACK_TO_PRAMIN
91 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
92#endif
93 return 0;
94 }
95 // Access and print the preemption mode and context ID
96 seq_printf(s, "%s|= Context State =====|\n", prefix);
97 seq_printf(s, "%s| Ctx. ID: %#10x|\n", prefix, ctxsw->context_id);
98 // No other CTXSW fields are supported pre-Pascal
99 if (g->chip_id < NV_CHIP_ID_PASCAL)
100 return 0;
101 seq_printf(s, "%s| Gfx. Preemption:%4s|\n", prefix,
102 graphics_preempt_type_to_text(ctxsw->graphics_preemption_options));
103 seq_printf(s, "%s| Cmp. Preemption:%4s|\n", prefix,
104 compute_preempt_type_to_text(ctxsw->compute_preemption_options));
105 seq_printf(s, "%s| #WFI Saves:%9d|\n", prefix, ctxsw->num_wfi_save_operations);
106 seq_printf(s, "%s| #CTA Saves:%9d|\n", prefix, ctxsw->num_cta_save_operations);
107 seq_printf(s, "%s| #GFXP Saves:%8d|\n", prefix, ctxsw->num_gfxp_save_operations);
108 seq_printf(s, "%s| #CILP Saves:%8d|\n", prefix, ctxsw->num_cilp_save_operations);
109#ifdef FALLBACK_TO_PRAMIN
110 nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
111#endif
112 return 0;
113}
114
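
The subcontext loop in runlist_detail_seq_show_inst() walks a 64-bit valid mask: hweight64() counts the set bits for the summary line, and the loop then visits only the enabled subcontexts. A standalone sketch of the same pattern with a made-up mask (popcount64() is a userspace stand-in for the kernel's hweight64()):

    #include <stdint.h>
    #include <stdio.h>

    /* Userspace stand-in for the kernel's hweight64() (population count). */
    static unsigned popcount64(uint64_t x)
    {
        unsigned n = 0;

        while (x) {
            x &= x - 1; /* clear the lowest set bit */
            n++;
        }
        return n;
    }

    int main(void)
    {
        uint64_t subcontext_pdb_valid = 0x7; /* hypothetical: SC00-SC02 enabled */
        int i;

        printf("Num subcontexts: %u\n", popcount64(subcontext_pdb_valid));
        for (i = 0; i < 64; i++) {
            /* Skip subcontexts without their enable bit set */
            if (!(1 & (subcontext_pdb_valid >> i)))
                continue;
            printf("SC%02d enabled\n", i);
        }
        return 0;
    }
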
10/* Print channel details using PCCSR (Programmable Channel Control System RAM?) 115/* Print channel details using PCCSR (Programmable Channel Control System RAM?)
11 @param s Pointer to state from seq_file subsystem to pass to seq_printf 116 @param s Pointer to state from seq_file subsystem to pass to seq_printf
12 @param g Pointer to our internal GPU state 117 @param g Pointer to our internal GPU state
@@ -32,16 +137,19 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
32 seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); 137 seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr);
33 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); 138 seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target));
34 seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); 139 seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
35 return 0; 140 // Print instance block
141 return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, chan.inst_target);
36} 142}
37 143
38/* `runlist_detail_seq_show_chan()`, but for Ampere+ 144/* `runlist_detail_seq_show_chan()`, but for Ampere+
145 @param instance_ptr Address for the channel instance block
146 @param instance_target Aperture of `instance_ptr`
39 @param runlist_pri_base Base of the RLRAM region for this runlist 147 @param runlist_pri_base Base of the RLRAM region for this runlist
40 148
41 `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on 149 `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on
42 Ampere+, and its location is configured in Runlist RAM. 150 Ampere+, and its location is configured in Runlist RAM.
43*/ 151*/
44static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { 152static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base, uint64_t instance_ptr, enum INST_TARGET instance_target) {
45 runlist_channel_config_t channel_config; 153 runlist_channel_config_t channel_config;
46 channel_ctrl_ga100_t chan; 154 channel_ctrl_ga100_t chan;
47 155
@@ -63,7 +171,7 @@ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug
63 seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); 171 seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy);
64 seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); 172 seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy);
65 seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); 173 seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail);
66 return 0; 174 return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, instance_target);
67} 175}
68#endif 176#endif
69 177
@@ -173,7 +281,7 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
173 if (g->chip_id < NV_CHIP_ID_AMPERE) 281 if (g->chip_id < NV_CHIP_ID_AMPERE)
174 runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); 282 runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
175 else 283 else
176 runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); 284 runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base, instance_ptr, inst_target(g, entry));
177#endif 285#endif
178 seq_printf(s, "%s+---------------------+\n", indt); 286 seq_printf(s, "%s+---------------------+\n", indt);
179 } 287 }
@@ -232,15 +340,17 @@ struct file_operations preempt_tsg_file_ops = {
232 340
233ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, 341ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer,
234 size_t count, loff_t *off) { 342 size_t count, loff_t *off) {
235 uint32_t target_runlist; 343 uint32_t target_runlist, target_offset;
236 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; 344 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
237 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec 345 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
238 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); 346 int err = kstrtou32_from_user(buffer, count, 0, &target_offset);
239 if (err) 347 if (err)
240 return err; 348 return err;
349 // (Ab)use the PDE_DATA field for the runlist ID
350 target_runlist = file2gpuidx(f);
241 351
242 // resubmit_runlist() checks that target_runlist is valid 352 // resubmit_runlist() checks that target_runlist is valid
243 if ((err = resubmit_runlist(g, target_runlist))) 353 if ((err = resubmit_runlist(g, target_runlist, target_offset)))
244 return err; 354 return err;
245 355
246 return count; 356 return count;
@@ -351,6 +461,54 @@ struct file_operations enable_channel_file_ops = {
351 .llseek = default_llseek, 461 .llseek = default_llseek,
352}; 462};
353 463
464ssize_t comm_preempt_channel_file_write(struct file *f, const char __user *buf,
465 size_t count, loff_t *off,
466 enum COMPUTE_PREEMPT_TYPE mode) {
467 uint32_t target_channel, target_runlist;
468 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
469 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
470 int err = kstrtou32_from_user(buf, count, 0, &target_channel);
471 if (err)
472 return err;
473 // (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
474 target_runlist = file2gpuidx(f);
475 // Set preemption mode for the context of this channel
476 if ((err = set_channel_preemption_mode(g, target_channel, target_runlist, mode)))
477 return err;
478
479 return count;
480}
481
482ssize_t wfi_preempt_channel_file_write(struct file *f, const char __user *buf,
483 size_t count, loff_t *off) {
484 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_WFI);
485}
486
487struct file_operations wfi_preempt_channel_file_ops = {
488 .write = wfi_preempt_channel_file_write,
489 .llseek = default_llseek,
490};
491
492ssize_t cta_preempt_channel_file_write(struct file *f, const char __user *buf,
493 size_t count, loff_t *off) {
494 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CTA);
495}
496
497struct file_operations cta_preempt_channel_file_ops = {
498 .write = cta_preempt_channel_file_write,
499 .llseek = default_llseek,
500};
501
502ssize_t cil_preempt_channel_file_write(struct file *f, const char __user *buf,
503 size_t count, loff_t *off) {
504 return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CILP);
505}
506
507struct file_operations cil_preempt_channel_file_ops = {
508 .write = cil_preempt_channel_file_write,
509 .llseek = default_llseek,
510};
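// Usage sketch for the three files above (file names are assumed from the
// *_file_ops identifiers; paths follow the existing per-runlist runlistY/
// layout, since the runlist ID is recovered from PDE_DATA via file2gpuidx()):
//   echo 4 > runlist0/wfi_preempt_channel   # wait-for-idle preemption for channel 4
//   echo 4 > runlist0/cta_preempt_channel   # CTA-level preemption for channel 4
//   echo 4 > runlist0/cil_preempt_channel   # compute-instruction-level preemption (CILP)
// Each write calls set_channel_preemption_mode() on the context of the named
// channel in that runlist.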
511
354// Tested working on Pascal (gp106) through Ada (ad102) 512// Tested working on Pascal (gp106) through Ada (ad102)
355ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, 513ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
356 size_t count, loff_t *off) { 514 size_t count, loff_t *off) {
@@ -419,11 +577,13 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
419 577
420 // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"? 578 // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"?
421 } 579 }
580#warning switch_to_tsg has preempt_runlist omitted!
581 return count;
422 582
423 // Resubmit the runlist to ensure that changes to channel enablement are 583 // Resubmit the runlist to ensure that changes to channel enablement are
424 // picked up on Turing+ GPUs (channel enablements may not be otherwise). 584 // picked up on Turing+ GPUs (channel enablements may not be otherwise).
425 if (g->chip_id >= NV_CHIP_ID_TURING) 585 if (g->chip_id >= NV_CHIP_ID_TURING)
426 if ((err = resubmit_runlist(g, target_runlist))) 586 if ((err = resubmit_runlist(g, target_runlist, -1)))
427 return err; 587 return err;
428 588
429 // Trigger a runlist-level preempt to stop whatever was running, triggering 589 // Trigger a runlist-level preempt to stop whatever was running, triggering
@@ -438,3 +598,470 @@ struct file_operations switch_to_tsg_file_ops = {
438 .write = switch_to_tsg_file_write, 598 .write = switch_to_tsg_file_write,
439 .llseek = default_llseek, 599 .llseek = default_llseek,
440}; 600};
601
602ssize_t preempt_runlist_file_write(struct file *f, const char __user *buffer,
603 size_t count, loff_t *off) {
604 uint32_t target_runlist;
605 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
606 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
607 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
608 if (err)
609 return err;
610
611 // TODO: Check runlist is in-range
612 if ((err = preempt_runlist(g, target_runlist)))
613 return err;
614
615 return count;
616}
617
618struct file_operations preempt_runlist_file_ops = {
619 .write = preempt_runlist_file_write,
620 .llseek = default_llseek,
621};
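// Usage sketch (file name assumed from the _file_ops identifier): writing a
// runlist ID triggers a runlist-level preempt of that runlist, e.g.
//   echo 3 > preempt_runlist
// Note that, per the TODO above, the runlist ID is not yet range-checked here.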
622
623// Value written to this file is which runlist to ack the IRQ for
624ssize_t ack_bad_tsg_file_write(struct file *f, const char __user *buffer,
625 size_t count, loff_t *off) {
626 uint32_t target_runlist;
627 uint32_t rl_ram_off;
628 struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
629 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
630 int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
631 if (err)
632 return err;
633
634 if ((err = get_runlist_ram(g, target_runlist, &rl_ram_off)))
635 return err;
636
637 nvdebug_writel(g, rl_ram_off + 0x100, 1 << 12);
638
639 return count;
640}
641
642struct file_operations ack_bad_tsg_file_ops = {
643 .write = ack_bad_tsg_file_write,
644 .llseek = default_llseek,
645};
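// Usage sketch (file name assumed from the _file_ops identifier): writing a
// runlist ID acknowledges the interrupt for that runlist by setting bit 12 of
// the register at offset 0x100 into its runlist RAM, e.g.
//   echo 2 > ack_bad_tsg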
646
647// Rather than mapping all of BAR0, we just map:
648// - On Pascal, Volta, Turing: MC_BOOT, PFIFO, PCCSR, PTOP
649// - On Ampere: MC_BOOT, RAMRL(0), CHRAM(0), PTOP
650// "All CUDA-managed pointers are within---the first 40 bits of the process's
651// VA space" (Sec. 4.1, GPUDirect RDMA Documentation)
652// - This means 0x00ff_ffff_ffff is the highest valid CUDA virtual address,
653// and all higher addresses are unused.
654// - So we use 0x6000_0000_0000+; this falls within the first PDE3 entry, and
655// at the end of the PDE2 entries
656// + Using the second PDE3 entry did not appear to work on Jetson (IIRC)
657#define BAR0_USER_ADDR 0x0000700000000000llu
658#define MEM_USER_ADDR 0x0000600000000000llu
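// Illustrative consequence of the constants above (a sketch; pointer names are
// this comment's own): once map_mem_for_instance() below has populated a
// channel's virtual address space, work running in that context can reach VRAM
// physical address P at MEM_USER_ADDR + P, and a mapped BAR0 register at
// offset R at BAR0_USER_ADDR + R, e.g.:
//   volatile uint32_t *boot0 =
//       (volatile uint32_t *)(BAR0_USER_ADDR + NV_MC_BOOT_0);
//   uint32_t id = *boot0;  // read MC_BOOT through the in-context mapping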
659
660/* Map all of GPU VRAM, and selected BAR0 regions, into a channel instance's
661 * virtual address space at predefined offsets (above).
662 *
663 * @param g Pointer to the nvdebug state for the selected GPU
664 * @param inst_ptr Dereferencible pointer to the channel's instance block
665 * @returns 0 on success, -errno on error
666 *
667 * Support: Pascal, Volta, Turing, Ampere
668 */
669int map_mem_for_instance(struct nvdebug_state *g, instance_ctrl_t *inst_ptr) {
670 int ret;
671 uintptr_t off, ram_size;
672 dma_addr_t bus_mc_boot_ram, bus_ptop_ram, bus_fifo_ram, bus_chan_ctrl_ram;
673 uint64_t mc_boot_ram, ptop_ram, fifo_ram, chan_ctrl_ram;
674 page_dir_config_t chan_pd_config;
675 memory_range_t mem_range;
676 uint32_t channel_ram_off, runlist_ram_off, channel_ram_size, bar0_base;
677 struct iommu_domain *dom;
678
679 if (g->chip_id >= NV_CHIP_ID_AMPERE) {
680 runlist_channel_config_t channel_config;
681 if ((ret = get_runlist_ram(g, 0, &runlist_ram_off))) {
682 printk(KERN_ERR "[nvdebug] %s: Unable to determine location of runlist0 RAM!\n", __func__);
683 return ret;
684 }
685 if (runlist_ram_off & 0xfff) {
686 printk(KERN_ERR "[nvdebug] %s: Runlist0 RAM is not page-aligned!\n", __func__);
687 return -EAFNOSUPPORT;
688 }
689 if ((channel_config.raw = nvdebug_readl(g, runlist_ram_off + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
690 return -EIO;
691 channel_ram_off = (uint32_t)channel_config.bar0_offset << 4;
692 if (channel_ram_off & 0xfff) {
693 printk(KERN_ERR "[nvdebug] %s: Runlist0 CHRAM is not page-aligned!\n", __func__);
694 return -EAFNOSUPPORT;
695 }
696 channel_ram_size = (1 << channel_config.num_channels_log2) * sizeof(channel_ctrl_ga100_t);
697 printk(KERN_DEBUG "[nvdebug] %s: Mapping CHRAM at %#018llx--%x and RLRAM at %#018llx--%x.\n", __func__, BAR0_USER_ADDR + channel_ram_off, channel_ram_size-1, BAR0_USER_ADDR + runlist_ram_off, 4095);
698 } else {
699 channel_ram_off = NV_PCCSR;
700 // MAX_CHID * sizeof(channel_ctrl_gf100_t) is < 4 KiB, so hardcode
701 channel_ram_size = 4096;
702 runlist_ram_off = NV_PFIFO;
703 }
704
705 // map_mem_by_chid() pulls the instance block via PRAMIN, so inst_ptr will
 706	// be invalid after moving PRAMIN (e.g. as part of a page table operation).
707 // To avoid accessing inst_ptr after invalidation, keep a copy of what we
708 // need.
709 chan_pd_config = inst_ptr->pdb;
710
711 // map_page_directory_v1() is unimplemented, precluding Maxwell (or older)
712 // support (as they don't support v2 page tables).
713 if (!chan_pd_config.is_ver2)
714 return -EOPNOTSUPP;
715
716 // Determine the size of GPU physical memory (VRAM).
717 if ((mem_range.raw = nvdebug_readl(g, NV_FB_MMU_LOCAL_MEMORY_RANGE)) == -1)
718 return -EIO;
719 ram_size = memory_range_to_bytes(mem_range);
720
721 // We map memory using huge pages, and thus do not support GPUs with
722 // non-2-MiB-divisible VID_MEM sizes.
723 if (ram_size % (1 << 21) != 0) {
724 printk(KERN_ERR "[nvdebug] %s: GPU VID_MEM of %lu bytes is not a multiple of 2 MiB!\n", __func__, ram_size);
725 return -EAFNOSUPPORT;
726 }
727
 728	// Map all of physical GPU memory (VID_MEM) into this channel's GPU virtual
729 // address space using huge (2 MiB) pages.
730 for (off = 0; off < ram_size; off += (1 << 21)) {
731 if ((ret = map_page_directory(g, chan_pd_config,
732 MEM_USER_ADDR + off, off, TARGET_VID_MEM, true)) < 0)
733 return ret;
734 // If the mapping already exists for this page directory, the other
735 // mappings should already exist, and can be skipped.
736 if (ret == 1) {
737 printk(KERN_INFO "[nvdebug] %s: VRAM mapping from %llx to %lx already exists. Assuming all mappings already exist and returning early...\n", __func__, MEM_USER_ADDR + off, off);
738 return 0;
739 }
740 }
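	// Sizing example (illustrative): a GPU with 4 GiB of VID_MEM takes
	// 4 GiB / 2 MiB = 2048 iterations of the loop above, i.e. 2048 huge-page
	// mappings rooted at MEM_USER_ADDR.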
741
742 // Map Channel RAM to a GPU-accessible bus address (gets past any IOMMU or
743 // IOVA layers), then map that address into this channel's GPU virtual
744 // address space. NV_PCCSR_CHANNEL_INST(0) is 4k-aligned, so it can be
745 // directly mapped.
746 // XXX: All these mappings are currently returning -1 on all reads on
747 // sunlight, jbakita-old, jetson-xavier, jetson-orin, and bonham,
748 // which seems to be returned from the PCIe root (on PCIe GPUs).
749 if (g->pcid)
750 bar0_base = pci_resource_start(g->pcid, 0);
751 else if (g->platd)
752 bar0_base = platform_get_resource(g->platd, IORESOURCE_MEM, 0)->start;
753 else
754 return -ENOTRECOVERABLE;
755 mc_boot_ram = NV_MC_BOOT_0 + bar0_base;
 756	// PTOP fits within a page but is not page-aligned; round down.
757 ptop_ram = (NV_PTOP & ~0xfffu) + bar0_base;
758 fifo_ram = runlist_ram_off + bar0_base;
759 chan_ctrl_ram = channel_ram_off + bar0_base;
760
761 // Check if GPU-accessible bus addresses are the same as CPU-visible physical
762 // addresses. Logic from amdgpu_device_check_iommu_direct_map().
763 dom = iommu_get_domain_for_dev(g->dev);
764 if (!dom || dom->type == IOMMU_DOMAIN_IDENTITY) {
765 // Used for: jbakita-old, sunlight, jetson-xavier, jetson-orin integrated, bonham, ?
766 // (For all these, reads on the mapping return only -1.)
767 // (Forcing these through dma_map_resource()/iommu_map() changes nothing)
768 // (Note that the `ls -l /sys/class/iommu/*/devices` also reports that the
769 // GPU is not available under the I/O MMU on these platforms.)
770 // To fix this, please enable AMD-Vi/ARM SMMU/Intel VT-d in your BIOS
771 // settings, UEFI settings, or device-tree file. Supported on:
772 // - AMD: Bulldozer+ (or Phenom II w/ 890FX or 990FX Chipset)
773 // - Intel: Most since Core2 Duo
774 // Note that while the Jetson Orin has an SMMU (I/O MMU), the GPU does not
775 // appear to be configured by any pre-provided device tree files to use the
776 // SMMU.
777 printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is unavailable/disabled for GPU %x. Assuming phys and bus addresses are identical...\n", g->chip_id);
778 bus_mc_boot_ram = mc_boot_ram;
779 bus_ptop_ram = ptop_ram;
780 bus_fifo_ram = fifo_ram;
781 bus_chan_ctrl_ram = chan_ctrl_ram;
782 } else {
783 printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is enabled. Attempting to use dma_map_resource()...\n");
784 // Used for: tama, yamaha
785 // Fails on tama, yamaha
786 // (Works on jetson-xavier, jetson-orin and bonham, but appears to be a no-op, and
787 // yields inaccessible memory. Get `mc-err: (255) csr_nvl7r: EMEM address decode error`
788 // on access on jetson boards, and a -1 read on all.)
789 bus_mc_boot_ram = dma_map_resource(g->dev, mc_boot_ram, 4096*2 /* *2 is a XXX hack to include PBUS */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
790 bus_ptop_ram = dma_map_resource(g->dev, ptop_ram, 4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
791 bus_fifo_ram = dma_map_resource(g->dev, fifo_ram, 4096*8 /* *8 is a XXX hack */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
792 bus_chan_ctrl_ram = dma_map_resource(g->dev, chan_ctrl_ram, 2*4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
793 if (dma_mapping_error(g->dev, bus_mc_boot_ram) ||
794 dma_mapping_error(g->dev, bus_ptop_ram) ||
795 dma_mapping_error(g->dev, bus_fifo_ram) ||
796 dma_mapping_error(g->dev, bus_chan_ctrl_ram)) {
797 // Used for: tama, yamaha
798 printk(KERN_WARNING "[nvdebug] map_mem_ctxid: Unable to map BAR0 addresses to device-accessible addresses via dma_map_resource(). Return codes: %d for MC_BOOT, %d for PFIFO, %d for PCCSR.\n",
799 dma_mapping_error(g->dev, bus_mc_boot_ram),
800 dma_mapping_error(g->dev, bus_fifo_ram),
801 dma_mapping_error(g->dev, bus_chan_ctrl_ram));
802 // This fallback does not appear to work on jbakita-old (5.4, GART IOMMU), but works on tama
803 if (!get_dma_ops(g->dev))
804 printk(KERN_WARNING "[nvdebug] Reason: No DMA `ops`, and direct mapping failed.\n");
805 else if (!get_dma_ops(g->dev)->map_resource)
806 // Fires on: tama, yamaha
807 printk(KERN_WARNING "[nvdebug] Reason: `map_resource` function undefined on this platform.\n");
808 if (!dom) {
809 printk(KERN_ERR "[nvdebug] map_mem_ctxid: No I/O MMU available and dma_map_resource() failed. Aborting mapping of BAR0 regions!\n");
810 return -ENOTRECOVERABLE;
811 }
812 printk(KERN_INFO "[nvdebug] map_mem_ctxid: Trying to fall back to direct I/O MMU manipulation...\n");
813 // XXX: Fallback to directly creating the I/O MMU mappings.
814 // This is necessary. Directly accessing BAR0 addresses throws I/O MMU
815 // errors in the kernel log on yamaha.
816 // See also: comment on kfd_mem_dmamap_sg_bo() in amdgpu
817 // Note: dma_map_resource -> map_resource -> [arm_]iommu_map_resource
818 // -> __iommu_dma_map -> iommu_map is the happy-path, but this seems to
819 // regularly fail, even though the iommu_map path works. One key
820 // difference is that the dma_map_resource() path also includes
821 // IOMMU_MMIO in the iommu_map() flags.
822 bus_mc_boot_ram = mc_boot_ram;
823 bus_ptop_ram = ptop_ram;
824 bus_fifo_ram = fifo_ram;
825 bus_chan_ctrl_ram = chan_ctrl_ram;
826 // Create identity mapping
827 ret = iommu_map(dom, mc_boot_ram, mc_boot_ram, 4096*2 /* *2 is a hack to fit in PBUS*/, IOMMU_READ | IOMMU_WRITE);
828 if (ret < 0) {
829 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for MC_BOOT!\n");
830 return ret;
831 }
832 ret = iommu_map(dom, ptop_ram, ptop_ram, 4096, IOMMU_READ | IOMMU_WRITE);
833 if (ret < 0) {
834 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PTOP!\n");
835 return ret;
836 }
837 ret = iommu_map(dom, fifo_ram, fifo_ram, 4096*8 /* *8 is XXX hack*/, IOMMU_READ | IOMMU_WRITE);
838 if (ret < 0) {
839 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for FIFO!\n");
840 return ret;
841 }
842 ret = iommu_map(dom, chan_ctrl_ram, chan_ctrl_ram, channel_ram_size, IOMMU_READ | IOMMU_WRITE);
843 if (ret < 0) {
844 printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PCCSR!\n");
845 return ret;
846 }
847 }
848 }
849 // TARGET_SYS_MEM_NONCOHERENT tells the GPU to bypass the CPU L2 cache for
850 // accesses to this memory.
851 // "Clients should normally use [SYS_MEM_NON_COHERENT]" (nvgpu)
852 //
853 // "Non-coherent system memory.
854 // (GPU) MMU will NOT maintain coherence with CPU L2 cache.
855 // Higher-level APIs should only allow this when it is known
856 // the memory is not cacheable by CPU or the coherency is
857 // managed explicitly (e.g. w/ flushes in SW).
858 // Also consider that this path is not necessarily faster." (open-gpu-kernel-modules)
859 //
860 // "Coherent system memory.
861 // (GPU) MMU will snoop CPU L2 cache if possible.
862 // This is usually the safer choice over NONCOH since it works
863 // whether the memory is cached by CPU L2 or not.
864 // On some CPU architectures going through CPU L2 may
865 // even be faster than the non-coherent path." (open-gpu-kernel-modules)
866 //
 867	// I suspect that for SYS_MEM_NONCOHERENT mappings, the "no snoop"
868 // attribute bit will be set on associated PCIe read/write transactions.
869 //
870 // The only other bits in a PCIe read/write transaction that could be
871 // relevant are the two AT (Address Translation) bits added in PCIe 2.0.
872 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0,
873 bus_mc_boot_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
874 return ret;
875 // XXX
876 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0 + 4096,
877 bus_mc_boot_ram + 4096, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
878 return ret;
879 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + (NV_PTOP & ~0xfffu),
880 bus_ptop_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
881 return ret;
882 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off,
883 bus_fifo_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
884 return ret;
885 // XXX
886 for (off = 4096; off < 8*4096; off += 4096)
887 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off+off,
888 bus_fifo_ram+off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
889 return ret;
890 // Channel control RAM can span two or more pages on Ampere+
891 for (off = 0; off < channel_ram_size; off += 4096)
892 if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + channel_ram_off + off,
893 bus_chan_ctrl_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
894 return ret;
895 return 0;
896}
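// Summary of the address-space layout produced by a successful call above:
//   MEM_USER_ADDR  + 0 .. ram_size-1           -> all of VID_MEM (2 MiB pages)
//   BAR0_USER_ADDR + NV_MC_BOOT_0 (2 pages)    -> MC_BOOT (+ the PBUS hack page)
//   BAR0_USER_ADDR + (NV_PTOP & ~0xfff)        -> PTOP (one page)
//   BAR0_USER_ADDR + runlist_ram_off (8 pages) -> PFIFO / runlist RAM
//   BAR0_USER_ADDR + channel_ram_off ..        -> channel control RAM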
897
898// Map by context ID
899// See constituent functions for info on what they do; comments not repeated.
900// Tested on Pascal, Volta, Turing, and Kepler
901ssize_t map_mem_ctxid_file_write(struct file *f, const char __user *buffer,
902 size_t count, loff_t *off) {
903 int err, target_context, target_runlist;
904 loff_t pos;
905 uint64_t instance_ptr;
906 enum INST_TARGET instance_target;
907 struct runlist_iter rl_iter;
908 instance_ctrl_t *inst;
909 context_switch_ctrl_t *ctx_block;
910 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
911 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
912 if ((err = kstrtou32_from_user(buffer, count, 0, &target_context)))
913 return err;
914 target_runlist = file2gpuidx(f);
915
916 // Get dereferencable pointer to the runlist
917 if ((err = get_runlist_iter(g, target_runlist, &rl_iter)))
918 return err;
919 // Find a channel in the runlist matching the provided context ID
920 for (pos = 0; pos < rl_iter.len; pos++, rl_iter.curr_entry += NV_RL_ENTRY_SIZE(g)) {
921 uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
922 if (entry_type(g, rl_iter.curr_entry) == ENTRY_TYPE_TSG)
923 continue;
924 // Get instance block address
925 if (g->chip_id >= NV_CHIP_ID_AMPERE) {
926 instance_ptr = ((struct gv100_runlist_chan*)rl_iter.curr_entry)->inst_ptr_hi;
927 instance_ptr <<= 32;
928 instance_ptr |= (uint64_t)inst_ptr_lo(g, rl_iter.curr_entry) << 12;
929 instance_target = inst_target(g, rl_iter.curr_entry);
930 ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
931 } else {
932 channel_ctrl_t chan;
933 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, rl_iter.curr_entry)));
934 if (chan.raw == -1)
935 return -EIO;
936 instance_ptr = (uint64_t)chan.inst_ptr << 12;
937 instance_target = chan.inst_target;
938 }
939 // Skip channels with unconfigured or INVALID instance blocks
940 if (!instance_ptr || instance_target == 1) {
941 printk(KERN_WARNING "[nvdebug] Channel %d is in runlist %d, but "
942 "lacks a valid instance block", chid(g, rl_iter.curr_entry),
943 target_runlist);
944 continue;
945 }
946
947 // Get a dereferencable pointer to the instance block
948 if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
949 return PTR_ERR(inst);
950 // If unable to access instance block, skip
951 if (!inst)
952 continue;
953
954 // Get dereferencable pointer to CTXSW block
955 if (IS_ERR(ctx_block = get_ctxsw(g, inst)))
956 return PTR_ERR(ctx_block);
957 // If unable to access CTXSW block, skip
958 if (!ctx_block)
959 continue;
960 // Check if the context ID matches
961 if (ctx_block->context_id != target_context)
962 continue;
963
964 // XXX: Disable the context switch timeout while we're here
965 ctxsw_timeout_t timeout_config;
966 if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
967 return -EIO;
968 timeout_config.enabled = 0;
969 nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
970 // XXX: Attempt setting preemption mode while we're here
971 ctx_block->compute_preemption_options = PREEMPT_CTA;
972
973 // Map memory and return
974 if ((err = map_mem_for_instance(g, inst)) < 0)
975 return err;
976 return count;
977 }
978 return -ESRCH;
979}
980
981struct file_operations map_mem_ctxid_file_ops = {
982 .write = map_mem_ctxid_file_write,
983 .llseek = default_llseek,
984};
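// Usage sketch (file name assumed from the _file_ops identifier; the file is
// per-runlist, since file2gpuidx() supplies the runlist ID):
//   echo 0x1 > runlist0/map_mem_ctxid
// searches runlist 0 for a channel whose CTXSW block carries context ID 0x1,
// then maps VRAM and the selected BAR0 regions into that channel's VA space.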
985
986// Map by channel ID (LEGACY; unclear if this needs to be kept)
987// Support: Pascal, Volta, and Turing only
988ssize_t map_mem_chid_file_write(struct file *f, const char __user *buffer,
989 size_t count, loff_t *off) {
990 int ret, target_channel;
991 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
992 channel_ctrl_t chan;
993 instance_ctrl_t *inst_ptr;
994 bool all = false;
995 uint64_t inst_ptr_off;
996 page_dir_config_t bar2_pd_config;
997 // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
998 if ((ret = kstrtos32_from_user(buffer, count, 0, &target_channel)))
999 return ret;
1000
1001 if (g->chip_id >= NV_CHIP_ID_AMPERE)
1002 return -ENOSYS;
1003
 1004	// This API is for nvsched, which is only supported on GPUs that support
1005 // instruction-level preemption (Pascal+).
1006 if (g->chip_id < NV_CHIP_ID_PASCAL)
1007 return -EOPNOTSUPP;
1008
1009 if (target_channel > MAX_CHID)
1010 return -ERANGE;
1011
1012 // Passing -1 indicates that all channels should be mapped
1013 if (target_channel == -1) {
1014 all = true;
1015 target_channel = 0;
1016 }
1017
1018 do {
1019 printk(KERN_INFO "[nvdebug] Mapping channel %d\n", target_channel);
1020 // Read the channel's configuration block, which includes the address of
1021 // this channel's instance block, which contains a page table pointer.
1022 // TODO: Verify this works with the channel RAM changes on Ampere+
1023 chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
1024 if (chan.raw == -1)
1025 return -EIO;
1026
1027 // If the instance pointer is unconfigured or the target is 1 (INVALID),
1028 // this channel is not in-use on any runlist and can be skipped.
1029 if (chan.inst_ptr == 0 || chan.inst_target == 1)
1030 continue;
1031
 1032		// Find page tables which define how BAR2 offsets are translated to physical
1033 // VID_MEM/SYS_MEM addresses. (We have to do this every time since we reset
1034 // PRAMIN.)
1035 if ((ret = get_bar2_pdb(g, &bar2_pd_config)) < 0)
1036 return ret;
1037
1038 // Pascal+ GPUs use Version 2 page tables, so this shouldn't be a problem
1039 if (!bar2_pd_config.is_ver2)
1040 return -ENOSYS;
1041
1042 // To read the instance block, first find where it is mapped in BAR2
1043 if ((inst_ptr_off = search_page_directory(g, bar2_pd_config, (u64)chan.inst_ptr << 12, chan.inst_target)) == 0) {
1044 // If no mapping can be found in BAR2, fallback to accessing the
1045 // instance block via the PRAMIN window.
1046 printk(KERN_WARNING "[nvdebug] Warning: Channel %d has no instance "
1047 "block mapped in BAR2. Falling back to PRAMIN...\n", target_channel);
1048 if ((ret = addr_to_pramin_mut(g, (u64)chan.inst_ptr << 12, chan.inst_target)) < 0)
1049 return -EOPNOTSUPP;
1050 inst_ptr = g->regs + NV_PRAMIN + ret;
1051 } else {
1052 inst_ptr = g->bar2 + inst_ptr_off;
1053 }
1054
1055 if ((ret = map_mem_for_instance(g, inst_ptr)))
1056 return ret;
1057
1058 // If mapping all channels, start again at the next one
1059 } while (all && ++target_channel <= MAX_CHID);
1060
1061 return count;
1062}
1063
1064struct file_operations map_mem_chid_file_ops = {
1065 .write = map_mem_chid_file_write,
1066 .llseek = default_llseek,
1067};
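// Usage sketch (file name assumed from the _file_ops identifier): writing a
// channel ID performs the same mapping for that channel's instance block;
// writing -1 walks every channel up to MAX_CHID, e.g.
//   echo 7  > map_mem_chid   # map for channel 7 only
//   echo -1 > map_mem_chid   # map for all configured channels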